From f38294e729109744c956b6c42ca1a63d313aa4ce Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 22:11:05 +0000 Subject: [PATCH 1/8] Add github workflow --- .github/workflows/index.yml | 80 ++++++++ src/__init__.py | 24 +++ src/file_filter.py | 123 +++++++++++ src/github_client.py | 307 ++++++++++++++++++++++++++++ src/index_manager.py | 395 ++++++++++++++++++++++++++++++++++++ src/main.py | 167 +++++++++++++++ src/models.py | 131 ++++++++++++ src/search.py | 132 ++++++++++++ 8 files changed, 1359 insertions(+) create mode 100644 .github/workflows/index.yml create mode 100644 src/__init__.py create mode 100644 src/file_filter.py create mode 100644 src/github_client.py create mode 100644 src/index_manager.py create mode 100644 src/main.py create mode 100644 src/models.py create mode 100644 src/search.py diff --git a/.github/workflows/index.yml b/.github/workflows/index.yml new file mode 100644 index 0000000..43c349a --- /dev/null +++ b/.github/workflows/index.yml @@ -0,0 +1,80 @@ +name: Index Repository + +on: + push: + branches: + - main + - develop + - 'feature/**' # Index feature branches + - 'release/**' # Index release branches + workflow_dispatch: + inputs: + branch: + description: 'Branch to index (leave empty for current branch)' + required: false + type: string + force_full_reindex: + description: 'Force full re-index' + required: false + type: boolean + default: false + +jobs: + index: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for comparison + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Restore index state + uses: actions/cache@v4 + with: + path: .augment-index-state + # Use branch-specific cache key + key: augment-index-${{ github.ref_name }}-${{ github.sha }} + restore-keys: | + augment-index-${{ 
github.ref_name }}- + + - name: Index repository + id: index + run: python src/main.py + env: + AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }} + AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + STORAGE_TYPE: file + # Branch-specific state path (automatically determined from GITHUB_REF) + # STATE_PATH is optional - defaults to .augment-index-state/{branch}/state.json + MAX_COMMITS: 100 + MAX_FILES: 500 + + - name: Print results + if: always() + run: | + echo "Success: ${{ steps.index.outputs.success }}" + echo "Type: ${{ steps.index.outputs.type }}" + echo "Files Indexed: ${{ steps.index.outputs.files_indexed }}" + echo "Files Deleted: ${{ steps.index.outputs.files_deleted }}" + echo "Checkpoint ID: ${{ steps.index.outputs.checkpoint_id }}" + echo "Commit SHA: ${{ steps.index.outputs.commit_sha }}" + + - name: Upload state artifact + if: success() + uses: actions/upload-artifact@v4 + with: + name: index-state + path: .augment-index-state/ + retention-days: 30 + diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..499dfe6 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,24 @@ +""" +GitHub Action Repository Indexer + +A Python example showing how to index a GitHub repository using the Augment SDK +Direct Mode with incremental updates. + +See README.md for usage instructions. +""" + +from .models import FileChange, IndexConfig, IndexResult, IndexState +from .file_filter import should_filter_file +from .github_client import GitHubClient +from .index_manager import IndexManager + +__all__ = [ + "FileChange", + "IndexConfig", + "IndexResult", + "IndexState", + "should_filter_file", + "GitHubClient", + "IndexManager", +] + diff --git a/src/file_filter.py b/src/file_filter.py new file mode 100644 index 0000000..88ab035 --- /dev/null +++ b/src/file_filter.py @@ -0,0 +1,123 @@ +""" +File filtering logic for GitHub repository indexing. 
+""" + +import re +from pathlib import Path +from typing import Optional + +# Keyish pattern regex - matches files that likely contain secrets/keys +KEYISH_PATTERN = re.compile( + r'^(\.git|.*\.pem|.*\.key|.*\.pfx|.*\.p12|.*\.jks|.*\.keystore|.*\.pkcs12|.*\.crt|.*\.cer|id_rsa|id_ed25519|id_ecdsa|id_dsa)$' +) + +# Default max file size in bytes (1 MB) +DEFAULT_MAX_FILE_SIZE = 1024 * 1024 # 1 MB + + +def always_ignore_path(path: str) -> bool: + """ + Check if a path should always be ignored (security measure). + + Args: + path: The file path to check. + + Returns: + True if the path contains ".." and should be ignored. + """ + return ".." in path + + +def is_keyish_path(path: str) -> bool: + """ + Check if a path matches the keyish pattern (secrets/keys). + + Args: + path: The file path to check. + + Returns: + True if the filename matches patterns for secret/key files. + """ + # Extract filename from path + filename = Path(path).name + return bool(KEYISH_PATTERN.match(filename)) + + +def is_valid_file_size(size_bytes: int, max_file_size: int = DEFAULT_MAX_FILE_SIZE) -> bool: + """ + Check if file size is valid for upload. + + Args: + size_bytes: The size of the file in bytes. + max_file_size: Maximum allowed file size in bytes. Defaults to 1 MB. + + Returns: + True if the file size is within the allowed limit. + """ + return size_bytes <= max_file_size + + +def is_valid_utf8(content: bytes) -> bool: + """ + Check if file content is valid UTF-8 (not binary). + + Args: + content: The file content as bytes. + + Returns: + True if the content is valid UTF-8, False if it's binary or invalid. + """ + try: + content.decode("utf-8") + return True + except UnicodeDecodeError: + return False + + +def should_filter_file( + path: str, + content: bytes, + max_file_size: Optional[int] = None, +) -> dict: + """ + Check if a file should be filtered out. + + Returns {"filtered": True, "reason": "..."} if file should be skipped. 
+ Returns {"filtered": False} if file should be included. + + Priority order (from file-filtering.md): + 1. Path validation (contains "..") + 2. File size check + 3. .augmentignore rules (checked by caller) + 4. Keyish patterns + 5. .gitignore rules (checked by caller) + 6. UTF-8 validation + + Args: + path: The file path to check. + content: The file content as bytes. + max_file_size: Maximum allowed file size in bytes. Defaults to DEFAULT_MAX_FILE_SIZE. + + Returns: + A dict with "filtered" (bool) and optionally "reason" (str) keys. + """ + effective_max_size = max_file_size if max_file_size is not None else DEFAULT_MAX_FILE_SIZE + + # 1. Check for ".." in path (security) + if always_ignore_path(path): + return {"filtered": True, "reason": "path_contains_dotdot"} + + # 2. Check file size + if not is_valid_file_size(len(content), effective_max_size): + return {"filtered": True, "reason": f"file_too_large ({len(content)} bytes)"} + + # 3. Check keyish patterns (secrets/keys) + if is_keyish_path(path): + return {"filtered": True, "reason": "keyish_pattern"} + + # 4. Check UTF-8 validity (binary detection) + if not is_valid_utf8(content): + return {"filtered": True, "reason": "binary_file"} + + return {"filtered": False} + diff --git a/src/github_client.py b/src/github_client.py new file mode 100644 index 0000000..f69bd62 --- /dev/null +++ b/src/github_client.py @@ -0,0 +1,307 @@ +""" +GitHub API client for fetching repository data. +""" + +import io +import tarfile + +import pathspec +import requests +from github import Github +from github.GithubException import GithubException + +from .file_filter import should_filter_file +from .models import FileChange + + +class GitHubClient: + """GitHub API client for fetching repository data.""" + + def __init__(self, token: str) -> None: + """ + Initialize the GitHub client with an authentication token. + + Args: + token: GitHub personal access token or GitHub App token. 
+ """ + self._github = Github(token) + self._token = token + + def resolve_ref(self, owner: str, repo: str, ref: str) -> str: + """ + Resolve a ref (like "HEAD", "main", or a commit SHA) to a commit SHA. + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to resolve. + + Returns: + The full 40-character commit SHA. + + Raises: + Exception: If the ref cannot be resolved. + """ + try: + repository = self._github.get_repo(f"{owner}/{repo}") + commit = repository.get_commit(ref) + return commit.sha + except GithubException as error: + raise Exception( + f'Failed to resolve ref "{ref}" for {owner}/{repo}: {error}' + ) from error + + def download_tarball(self, owner: str, repo: str, ref: str) -> dict[str, str]: + """ + Download repository as tarball and extract files. + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to download. + + Returns: + Dictionary mapping file paths to their contents. + """ + print(f"Downloading tarball for {owner}/{repo}@{ref}...") + + repository = self._github.get_repo(f"{owner}/{repo}") + tarball_url = repository.get_archive_link("tarball", ref) + + # Download tarball (10 minute timeout to handle large repositories) + # Include auth header for private repos + headers = {"Authorization": f"Bearer {self._token}"} + response = requests.get(tarball_url, headers=headers, stream=True, timeout=600) + if not response.ok: + raise Exception(f"Failed to download tarball: {response.reason}") + + # Load ignore patterns + augmentignore, gitignore = self._load_ignore_patterns(owner, repo, ref) + + # Track filtering statistics + files: dict[str, str] = {} + total_files = 0 + filtered_files = 0 + filter_reasons: dict[str, int] = {} + + # Extract files from tarball + tarball_data = io.BytesIO(response.content) + with tarfile.open(fileobj=tarball_data, mode="r:gz") as tar: + for member in tar.getmembers(): + # Skip directories and symlinks + if not member.isfile(): + continue + + total_files += 1 + + # Remove 
the root directory prefix (e.g., "owner-repo-sha/") + path_parts = member.name.split("/") + path_parts.pop(0) # Remove first component + file_path = "/".join(path_parts) + + if not file_path: + continue + + # Read file contents + file_obj = tar.extractfile(member) + if file_obj is None: + continue + content_bytes = file_obj.read() + + # Apply filtering in priority order: + # 1. .augmentignore + if augmentignore and augmentignore.match_file(file_path): + filtered_files += 1 + filter_reasons["augmentignore"] = filter_reasons.get("augmentignore", 0) + 1 + continue + + # 2. Path validation, file size, keyish patterns, UTF-8 validation + filter_result = should_filter_file(path=file_path, content=content_bytes) + + if filter_result["filtered"]: + filtered_files += 1 + reason = filter_result.get("reason", "unknown") + filter_reasons[reason] = filter_reasons.get(reason, 0) + 1 + continue + + # 3. .gitignore (checked last) + if gitignore and gitignore.match_file(file_path): + filtered_files += 1 + filter_reasons["gitignore"] = filter_reasons.get("gitignore", 0) + 1 + continue + + # File passed all filters + try: + contents = content_bytes.decode("utf-8") + files[file_path] = contents + except UnicodeDecodeError: + # This should not happen if is_valid_utf8() is working correctly + filtered_files += 1 + filter_reasons["decode_error"] = filter_reasons.get("decode_error", 0) + 1 + print(f"Warning: File {file_path} passed UTF-8 validation but failed to decode") + + print(f"Extracted {len(files)} files from tarball") + print(f"Filtered {filtered_files} of {total_files} files. Reasons: {filter_reasons}") + return files + + def compare_commits( + self, owner: str, repo: str, base: str, head: str + ) -> dict: + """ + Compare two commits and get file changes. 
+ """ + print(f"Comparing {base}...{head}...") + + repository = self._github.get_repo(f"{owner}/{repo}") + comparison = repository.compare(base, head) + + files: list[FileChange] = [] + + for file in comparison.files: + change = FileChange( + path=file.filename, + status=self._map_github_status(file.status), + previousFilename=file.previous_filename, + ) + + # Download file contents for added/modified files + if change.status in ("added", "modified"): + try: + contents = self.get_file_contents(owner, repo, file.filename, head) + change.contents = contents + except Exception as error: + print(f"Warning: Failed to download {file.filename}: {error}") + + files.append(change) + + return { + "files": files, + "commits": comparison.total_commits, + "totalChanges": len(comparison.files), + } + + def get_file_contents( + self, owner: str, repo: str, path: str, ref: str + ) -> str: + """ + Get file contents at a specific ref. + + Args: + owner: Repository owner. + repo: Repository name. + path: File path within the repository. + ref: Git ref to get contents at. + + Returns: + The file contents as a string. + + Raises: + Exception: If the path is not a file. + """ + repository = self._github.get_repo(f"{owner}/{repo}") + content = repository.get_contents(path, ref) + + if isinstance(content, list): + raise Exception(f"{path} is not a file") + + return content.decoded_content.decode("utf-8") + + def _load_ignore_patterns( + self, owner: str, repo: str, ref: str + ) -> tuple[pathspec.PathSpec | None, pathspec.PathSpec | None]: + """ + Load .gitignore and .augmentignore patterns separately. + + Returns both filters to maintain proper priority order: + .augmentignore → keyish → .gitignore + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to load patterns from. + + Returns: + Tuple of (augmentignore, gitignore) PathSpec objects, or None if not found. 
+ """ + augmentignore: pathspec.PathSpec | None = None + gitignore: pathspec.PathSpec | None = None + + # Try to load .gitignore + try: + gitignore_content = self.get_file_contents(owner, repo, ".gitignore", ref) + gitignore = pathspec.PathSpec.from_lines("gitwildmatch", gitignore_content.splitlines()) + except Exception: + # .gitignore doesn't exist + pass + + # Try to load .augmentignore + try: + augmentignore_content = self.get_file_contents(owner, repo, ".augmentignore", ref) + augmentignore = pathspec.PathSpec.from_lines("gitwildmatch", augmentignore_content.splitlines()) + except Exception: + # .augmentignore doesn't exist + pass + + return augmentignore, gitignore + + def _map_github_status(self, status: str) -> str: + """ + Map GitHub file status to our FileChange status. + + Args: + status: GitHub file status string. + + Returns: + Normalized status string. + """ + status_map = { + "added": "added", + "modified": "modified", + "removed": "removed", + "renamed": "renamed", + } + return status_map.get(status, "modified") + + def ignore_files_changed( + self, owner: str, repo: str, base: str, head: str + ) -> bool: + """ + Check if ignore files changed between commits. + + Args: + owner: Repository owner. + repo: Repository name. + base: Base commit SHA. + head: Head commit SHA. + + Returns: + True if .gitignore or .augmentignore changed, False otherwise. + """ + repository = self._github.get_repo(f"{owner}/{repo}") + comparison = repository.compare(base, head) + + ignore_files = [".gitignore", ".augmentignore"] + return any(file.filename in ignore_files for file in comparison.files) + + def is_force_push( + self, owner: str, repo: str, base: str, head: str + ) -> bool: + """ + Check if the push was a force push. + + Args: + owner: Repository owner. + repo: Repository name. + base: Base commit SHA. + head: Head commit SHA. + + Returns: + True if the push was a force push, False otherwise. 
+ """ + try: + repository = self._github.get_repo(f"{owner}/{repo}") + repository.compare(base, head) + return False + except GithubException: + # If comparison fails, it's likely a force push + return True diff --git a/src/index_manager.py b/src/index_manager.py new file mode 100644 index 0000000..c2bf48f --- /dev/null +++ b/src/index_manager.py @@ -0,0 +1,395 @@ +""" +Index Manager - Core indexing logic +""" + +import json +import tempfile +from pathlib import Path +from typing import Optional + +from auggie_sdk.context import DirectContext, File + +from .github_client import GitHubClient +from .models import FileChange, IndexConfig, IndexResult, IndexState, RepositoryInfo + +DEFAULT_MAX_COMMITS = 100 +DEFAULT_MAX_FILES = 500 + + +class IndexManager: + """Index Manager - Core indexing logic for GitHub repositories.""" + + def __init__( + self, context: DirectContext, config: IndexConfig, state_path: str + ) -> None: + """ + Initialize the IndexManager. + + Args: + context: DirectContext instance for indexing operations. + config: Configuration for the indexing operation. + state_path: Path to the state file for persistence. + """ + self._context = context + self._config = config + self._state_path = state_path + self._github = GitHubClient(config.githubToken) + + def resolve_commit_sha(self) -> None: + """ + Resolve the current commit ref to an actual commit SHA. + + This handles cases where GITHUB_SHA might be "HEAD" or a branch name. + Updates the config.currentCommit with the resolved SHA. + """ + resolved_sha = self._github.resolve_ref( + self._config.owner, self._config.repo, self._config.currentCommit + ) + self._config.currentCommit = resolved_sha + + def _load_state(self) -> Optional[IndexState]: + """ + Load index state from file system. 
+ + EXTENDING TO OTHER STORAGE BACKENDS: + Replace this method to load state from your preferred storage: + - Redis: Use redis-py client to GET the state JSON + - S3: Use boto3 to get_object from S3 bucket + - Database: Query your database for the state record + + Example for Redis: + import redis + r = redis.Redis.from_url(redis_url) + data = r.get(state_key) + return json.loads(data) if data else None + + Example for S3: + import boto3 + s3 = boto3.client('s3') + response = s3.get_object(Bucket=bucket, Key=key) + data = response['Body'].read().decode('utf-8') + return json.loads(data) + + Returns: + The loaded IndexState or None if the file doesn't exist. + """ + try: + with open(self._state_path, "r", encoding="utf-8") as f: + return json.load(f) + except FileNotFoundError: + return None + + def _save_state(self, state: IndexState) -> None: + """ + Save index state to file system. + + EXTENDING TO OTHER STORAGE BACKENDS: + Replace this method to save state to your preferred storage: + - Redis: Use redis-py client to SET the state JSON + - S3: Use boto3 to put_object to S3 bucket + - Database: Insert or update the state record in your database + + Example for Redis: + import redis + r = redis.Redis.from_url(redis_url) + r.set(state_key, json.dumps(state)) + + Example for S3: + import boto3 + s3 = boto3.client('s3') + s3.put_object( + Bucket=bucket, + Key=key, + Body=json.dumps(state), + ContentType='application/json' + ) + + Note: The state is just a JSON object (IndexState type) that can be + serialized and stored anywhere. For distributed systems, consider using + Redis or a database for shared state across multiple workers. + + Args: + state: The IndexState to save. + """ + # Ensure directory exists + Path(self._state_path).parent.mkdir(parents=True, exist_ok=True) + + # Write state to file + with open(self._state_path, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2) + + def index(self) -> IndexResult: + """ + Main indexing entry point. 
+ + Returns: + IndexResult with success status and indexing details. + """ + print( + f"Starting index for {self._config.owner}/{self._config.repo}" + f"@{self._config.branch}" + ) + + try: + # Load previous state + previous_state = self._load_state() + + # If we have previous state, we'll need to create a new context with the imported state + # For now, we'll handle this in the incremental update logic + + # Determine if we need full re-index + should_reindex, reason = self._should_full_reindex(previous_state) + + if should_reindex: + return self._full_reindex(reason) + + # Perform incremental update + # previous_state is guaranteed to be non-null here + if not previous_state: + raise RuntimeError("previous_state should not be None at this point") + return self._incremental_update(previous_state) + except Exception as error: + print(f"Indexing failed: {error}") + return IndexResult( + success=False, + type="full", + filesIndexed=0, + filesDeleted=0, + checkpointId="", + commitSha=self._config.currentCommit, + error=str(error), + ) + + def _should_full_reindex( + self, previous_state: Optional[IndexState] + ) -> tuple[bool, Optional[str]]: + """ + Determine if full re-index is needed. + + Args: + previous_state: The previous index state, or None if first run. + + Returns: + Tuple of (should_reindex, reason). 
+ """ + # No previous state - first run + if not previous_state: + return (True, "first_run") + + # Different repository + if ( + previous_state["repository"]["owner"] != self._config.owner + or previous_state["repository"]["name"] != self._config.repo + ): + return (True, "different_repository") + + # Same commit - no changes + if previous_state["lastCommitSha"] == self._config.currentCommit: + print("No changes detected") + return (False, None) + + # Check for force push + is_force_push = self._github.is_force_push( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + if is_force_push: + return (True, "force_push") + + # Get comparison + comparison = self._github.compare_commits( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + # Too many commits + max_commits = self._config.maxCommits or DEFAULT_MAX_COMMITS + if comparison["commits"] > max_commits: + return ( + True, + f"too_many_commits ({comparison['commits']} > {max_commits})", + ) + + # Too many file changes + max_files = self._config.maxFiles or DEFAULT_MAX_FILES + if comparison["totalChanges"] > max_files: + return ( + True, + f"too_many_files ({comparison['totalChanges']} > {max_files})", + ) + + # Check if ignore files changed + ignore_changed = self._github.ignore_files_changed( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + if ignore_changed: + return (True, "ignore_files_changed") + + return (False, None) + + def _full_reindex(self, reason: Optional[str]) -> IndexResult: + """ + Perform full repository re-index. + + Args: + reason: The reason for the full re-index. + + Returns: + IndexResult with the result of the full re-index. 
+ """ + print(f"Performing full re-index (reason: {reason or 'unknown'})") + + # Download entire repository as tarball + files = self._github.download_tarball( + self._config.owner, self._config.repo, self._config.currentCommit + ) + + # Add all files to index + files_to_index = [ + File(path=path, contents=contents) for path, contents in files.items() + ] + + print(f"Adding {len(files_to_index)} files to index...") + self._context.add_to_index(files_to_index) + + # Export DirectContext state + context_state = self._context.export() + context_state_dict = context_state.to_dict() + + new_state: IndexState = { + "contextState": context_state_dict, + "lastCommitSha": self._config.currentCommit, + "repository": RepositoryInfo( + owner=self._config.owner, + name=self._config.repo, + ), + } + + # Save state + self._save_state(new_state) + + return IndexResult( + success=True, + type="full", + filesIndexed=len(files_to_index), + filesDeleted=0, + checkpointId=context_state.checkpoint_id or "", + commitSha=self._config.currentCommit, + reindexReason=reason, + ) + + def _incremental_update(self, previous_state: IndexState) -> IndexResult: + """ + Perform incremental update. + + Args: + previous_state: The previous index state. + + Returns: + IndexResult with the result of the incremental update. 
+ """ + print("Performing incremental update...") + + # Create a temporary file with the previous context state + # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open + temp_file = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", prefix="github-indexer-incremental-", delete=False + ) + temp_path = Path(temp_file.name) + try: + json.dump(previous_state["contextState"], temp_file, indent=2) + temp_file.close() # Close before reading on Windows + + # Create a new context from the previous state + self._context = DirectContext.import_from_file( + str(temp_path), + api_key=self._config.apiToken, + api_url=self._config.apiUrl, + ) + finally: + temp_path.unlink(missing_ok=True) + + # Get file changes + comparison = self._github.compare_commits( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + # Process changes + files_to_add, files_to_delete = self._process_file_changes(comparison["files"]) + + print(f"Adding {len(files_to_add)} files, deleting {len(files_to_delete)} files") + + # Update index + if files_to_add: + self._context.add_to_index(files_to_add) + + if files_to_delete: + self._context.remove_from_index(files_to_delete) + + # Export DirectContext state + context_state = self._context.export() + context_state_dict = context_state.to_dict() + + new_state: IndexState = { + "contextState": context_state_dict, + "lastCommitSha": self._config.currentCommit, + "repository": previous_state["repository"], + } + + # Save state + self._save_state(new_state) + + return IndexResult( + success=True, + type="incremental", + filesIndexed=len(files_to_add), + filesDeleted=len(files_to_delete), + checkpointId=context_state.checkpoint_id or "", + commitSha=self._config.currentCommit, + ) + + def _process_file_changes( + self, changes: list[FileChange] + ) -> tuple[list[File], list[str]]: + """ + Process file changes and categorize them for indexing. 
+ + Args: + changes: List of file changes from the comparison. + + Returns: + Tuple of (files_to_add, files_to_delete). + """ + files_to_add: list[File] = [] + files_to_delete: list[str] = [] + + for change in changes: + if change.status in ("added", "modified"): + if change.contents: + files_to_add.append( + File(path=change.path, contents=change.contents) + ) + elif change.status == "removed": + files_to_delete.append(change.path) + elif change.status == "renamed": + if change.previousFilename: + files_to_delete.append(change.previousFilename) + if change.contents: + files_to_add.append( + File(path=change.path, contents=change.contents) + ) + + return files_to_add, files_to_delete + diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..fd10065 --- /dev/null +++ b/src/main.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +""" +Main entry point for GitHub Action Indexer + +Usage: + cd examples/python-sdk/context + python -m github_action_indexer index +""" + +import os +import re +import sys + +from auggie_sdk.context import DirectContext + +from .index_manager import IndexManager +from .models import IndexConfig + + +def get_api_credentials() -> tuple[str, str]: + """Get API credentials from environment variables.""" + api_token = os.environ.get("AUGMENT_API_TOKEN") + if not api_token: + raise ValueError("AUGMENT_API_TOKEN environment variable is required") + + api_url = os.environ.get("AUGMENT_API_URL") + if not api_url: + raise ValueError( + "AUGMENT_API_URL environment variable is required. Please set it to your " + "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')" + ) + + return api_token, api_url + + +def parse_repository_info() -> tuple[str, str, str, str]: + """ + Parse repository information from environment variables. + Returns (owner, repo, branch, current_commit). 
+ """ + repository = os.environ.get("GITHUB_REPOSITORY", "") + parts = repository.split("/") + + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ValueError('GITHUB_REPOSITORY must be in format "owner/repo"') + + owner, repo = parts + + # Extract branch name from GitHub ref + github_ref = os.environ.get("GITHUB_REF", "") + github_ref_name = os.environ.get("GITHUB_REF_NAME", "") + + if github_ref.startswith("refs/heads/"): + branch = github_ref_name + elif github_ref.startswith("refs/tags/"): + branch = f"tag/{github_ref_name}" + elif github_ref_name: + branch = github_ref_name + else: + branch = os.environ.get("BRANCH", "main") + + current_commit = os.environ.get("GITHUB_SHA", "") + if not current_commit: + raise ValueError("GITHUB_SHA environment variable is required") + + return owner, repo, branch, current_commit + + +def load_config() -> IndexConfig: + """Load configuration from environment variables.""" + github_token = os.environ.get("GITHUB_TOKEN") + if not github_token: + raise ValueError("GITHUB_TOKEN environment variable is required") + + api_token, api_url = get_api_credentials() + owner, repo, branch, current_commit = parse_repository_info() + + max_commits = os.environ.get("MAX_COMMITS") + max_files = os.environ.get("MAX_FILES") + + return IndexConfig( + apiToken=api_token, + apiUrl=api_url, + githubToken=github_token, + owner=owner, + repo=repo, + branch=branch, + currentCommit=current_commit, + maxCommits=int(max_commits) if max_commits else None, + maxFiles=int(max_files) if max_files else None, + ) + + +def get_state_path(branch: str) -> str: + """Get the state file path for the current branch.""" + sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) + return os.environ.get( + "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" + ) + + +def main() -> None: + """Main function.""" + print("GitHub Action Indexer - Starting...") + + try: + # Load configuration + config = load_config() + state_path = 
get_state_path(config.branch) + + print(f"Repository: {config.owner}/{config.repo}") + print(f"Branch: {config.branch}") + print(f"Commit ref: {config.currentCommit}") + print(f"State path: {state_path}") + + # Create DirectContext + context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl) + + # Create index manager and resolve commit SHA + manager = IndexManager(context, config, state_path) + manager.resolve_commit_sha() + + print(f"Resolved commit SHA: {config.currentCommit}") + + # Perform indexing + result = manager.index() + + # Print results + print("\n=== Indexing Results ===") + print(f"Success: {result.success}") + print(f"Type: {result.type}") + print(f"Files Indexed: {result.filesIndexed}") + print(f"Files Deleted: {result.filesDeleted}") + print(f"Checkpoint ID: {result.checkpointId}") + print(f"Commit SHA: {result.commitSha}") + + if result.reindexReason: + print(f"Re-index Reason: {result.reindexReason}") + + if result.error: + print(f"Error: {result.error}", file=sys.stderr) + sys.exit(1) + + # Set GitHub Actions output + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + output_lines = [ + f"success={result.success}", + f"type={result.type}", + f"files_indexed={result.filesIndexed}", + f"files_deleted={result.filesDeleted}", + f"checkpoint_id={result.checkpointId}", + f"commit_sha={result.commitSha}", + ] + with open(github_output, "a") as f: + f.write("\n".join(output_lines) + "\n") + + print("\nIndexing completed successfully!") + + except Exception as error: + print(f"Fatal error: {error}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..8b3dfc0 --- /dev/null +++ b/src/models.py @@ -0,0 +1,131 @@ +""" +Types for the GitHub Action Indexer + +This module defines the data types used by the GitHub Action Indexer +for tracking index state, file changes, configuration, and results. 
+""" + +from dataclasses import dataclass +from typing import Literal, Optional + +from typing_extensions import TypedDict + +from auggie_sdk.context.models import DirectContextState + + +class RepositoryInfo(TypedDict): + """Repository information for index state.""" + + owner: str # Repository owner + name: str # Repository name + + +class IndexState(TypedDict): + """ + Persistent state for the GitHub Action Indexer. + + This state is stored between indexing runs to enable incremental indexing. + """ + + contextState: DirectContextState + """DirectContext state (checkpoint, blobs, etc.)""" + + lastCommitSha: str + """Last indexed commit SHA (must be a full 40-character SHA, not a ref like 'HEAD')""" + + repository: RepositoryInfo + """Repository information - used to verify we're indexing the same repository""" + + +@dataclass +class FileChange: + """ + Represents a file change detected between commits. + + Used to track what files need to be indexed or removed from the index. + """ + + path: str + """File path""" + + status: Literal["added", "modified", "removed", "renamed"] + """Change status: added, modified, removed, renamed""" + + previousFilename: Optional[str] = None + """Previous filename (for renames)""" + + contents: Optional[str] = None + """File contents (for added/modified files)""" + + oldBlobName: Optional[str] = None + """Blob name from previous index (for modified/removed files)""" + + +@dataclass +class IndexConfig: + """ + Configuration for the GitHub Action Indexer. + + Contains all the settings needed to perform indexing of a GitHub repository. 
+ """ + + apiToken: str + """Augment API token""" + + apiUrl: str + """Augment API URL (provided via AUGMENT_API_URL env var)""" + + githubToken: str + """GitHub token""" + + owner: str + """Repository owner""" + + repo: str + """Repository name""" + + branch: str + """Branch to index""" + + currentCommit: str + """Current commit SHA""" + + maxCommits: Optional[int] = None + """Maximum commits before full re-index""" + + maxFiles: Optional[int] = None + """Maximum file changes before full re-index""" + + +@dataclass +class IndexResult: + """ + Result from an indexing operation. + + Contains information about what was indexed and whether it was successful. + """ + + success: bool + """Whether indexing was successful""" + + type: Literal["full", "incremental", "no-changes"] + """Type of indexing performed""" + + filesIndexed: int + """Number of files indexed""" + + filesDeleted: int + """Number of files deleted""" + + checkpointId: str + """New checkpoint ID""" + + commitSha: str + """Commit SHA that was indexed""" + + error: Optional[str] = None + """Error message if failed""" + + reindexReason: Optional[str] = None + """Reason for full re-index (if applicable)""" + diff --git a/src/search.py b/src/search.py new file mode 100644 index 0000000..fdac426 --- /dev/null +++ b/src/search.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +CLI tool to search the indexed repository + +Usage: + cd examples/python-sdk/context + python -m github_action_indexer search "your search query" + python -m github_action_indexer search "your search query" --max-chars 5000 +""" + +import argparse +import json +import os +import re +import sys +import tempfile +from pathlib import Path +from typing import Optional + +from auggie_sdk.context import DirectContext + +from .models import IndexState + + +def get_state_path() -> str: + """Get the state file path for the current branch.""" + branch = os.environ.get("BRANCH", "main") + sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) + 
return os.environ.get( + "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" + ) + + +def load_state(state_path: str) -> Optional[IndexState]: + """Load index state from file system.""" + try: + with open(state_path, "r") as f: + data = f.read() + return json.loads(data) + except FileNotFoundError: + return None + + +def main() -> None: + """Main search function.""" + # Parse command line arguments + parser = argparse.ArgumentParser( + description="Search the indexed repository", + epilog='Example: python search.py "authentication functions"', + ) + parser.add_argument("query", help="Search query") + parser.add_argument( + "--max-chars", + type=int, + help="Maximum number of characters in output", + dest="max_chars", + ) + args = parser.parse_args() + + # Get API credentials + api_token = os.environ.get("AUGMENT_API_TOKEN") + if not api_token: + print("Error: AUGMENT_API_TOKEN environment variable is required", file=sys.stderr) + sys.exit(1) + + api_url = os.environ.get("AUGMENT_API_URL") + if not api_url: + print( + "Error: AUGMENT_API_URL environment variable is required. Please set it to your " + "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')", + file=sys.stderr, + ) + sys.exit(1) + + print(f'Searching for: "{args.query}"') + if args.max_chars is not None: + print(f"Limiting results to max {args.max_chars} characters\n") + else: + print() + + try: + # Load the index state first + state_path = get_state_path() + print(f"Loading index state from: {state_path}") + state = load_state(state_path) + + if not state: + print("Error: No index state found. 
Run indexing first.", file=sys.stderr) + print(" python -m github_action_indexer index", file=sys.stderr) + sys.exit(1) + + # Create a temporary file with the context state for import + # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open + temp_file = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", prefix="github-indexer-state-", delete=False + ) + temp_path = Path(temp_file.name) + try: + json.dump(state["contextState"], temp_file, indent=2) + temp_file.close() # Close before reading on Windows + + # Import state using DirectContext.import_from_file + context = DirectContext.import_from_file( + str(temp_path), api_key=api_token, api_url=api_url + ) + finally: + temp_path.unlink(missing_ok=True) + + file_count = len(state["contextState"].get("blobs", [])) + + print(f"Loaded index: {file_count} files indexed") + print(f"Repository: {state['repository']['owner']}/{state['repository']['name']}") + print(f"Last indexed commit: {state['lastCommitSha']}\n") + + # Perform search with optional character limit + results = context.search(args.query, max_output_length=args.max_chars) + + if not results or results.strip() == "": + print("No results found.") + return + + print("Search results:\n") + print(results) + + except Exception as error: + print(f"Search failed: {error}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() + From d4564b840194d25c58612cdc8aa37da24269d5ad Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 22:28:37 +0000 Subject: [PATCH 2/8] Add the workflow requiremens --- requirements.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..74d3b3a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +# GitHub Action Indexer dependencies +# Core SDK (from parent package) +# For running standalone, also install context_sdk from parent + +# GitHub API client 
+PyGithub>=2.1.0 + +# HTTP requests (for tarball download) +requests>=2.25.0 + +# Gitignore-style pattern matching +pathspec>=0.11.0 + From e23992122e7530b069ddbbe50f7f57d8ff1a34d7 Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 22:32:19 +0000 Subject: [PATCH 3/8] Update requirements --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 74d3b3a..4d507b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ # GitHub Action Indexer dependencies # Core SDK (from parent package) # For running standalone, also install context_sdk from parent +# Augment SDK for indexing and search +auggie-sdk>=0.1.0 # GitHub API client PyGithub>=2.1.0 From 5bec0e1b5f9476ecac0168a0beb1ef4458991acc Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 22:51:40 +0000 Subject: [PATCH 4/8] Retry workflow --- .github/workflows/index.yml | 2 +- {src => augment_indexer}/__init__.py | 0 {src => augment_indexer}/file_filter.py | 0 {src => augment_indexer}/github_client.py | 0 {src => augment_indexer}/index_manager.py | 0 {src => augment_indexer}/main.py | 0 {src => augment_indexer}/models.py | 0 {src => augment_indexer}/search.py | 0 8 files changed, 1 insertion(+), 1 deletion(-) rename {src => augment_indexer}/__init__.py (100%) rename {src => augment_indexer}/file_filter.py (100%) rename {src => augment_indexer}/github_client.py (100%) rename {src => augment_indexer}/index_manager.py (100%) rename {src => augment_indexer}/main.py (100%) rename {src => augment_indexer}/models.py (100%) rename {src => augment_indexer}/search.py (100%) diff --git a/.github/workflows/index.yml b/.github/workflows/index.yml index 43c349a..9d1811e 100644 --- a/.github/workflows/index.yml +++ b/.github/workflows/index.yml @@ -49,7 +49,7 @@ jobs: - name: Index repository id: index - run: python src/main.py + run: python -m augment_indexer.main env: AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }} 
AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }} diff --git a/src/__init__.py b/augment_indexer/__init__.py similarity index 100% rename from src/__init__.py rename to augment_indexer/__init__.py diff --git a/src/file_filter.py b/augment_indexer/file_filter.py similarity index 100% rename from src/file_filter.py rename to augment_indexer/file_filter.py diff --git a/src/github_client.py b/augment_indexer/github_client.py similarity index 100% rename from src/github_client.py rename to augment_indexer/github_client.py diff --git a/src/index_manager.py b/augment_indexer/index_manager.py similarity index 100% rename from src/index_manager.py rename to augment_indexer/index_manager.py diff --git a/src/main.py b/augment_indexer/main.py similarity index 100% rename from src/main.py rename to augment_indexer/main.py diff --git a/src/models.py b/augment_indexer/models.py similarity index 100% rename from src/models.py rename to augment_indexer/models.py diff --git a/src/search.py b/augment_indexer/search.py similarity index 100% rename from src/search.py rename to augment_indexer/search.py From 480fd137b85df040d8657eac2f86e8488027c32d Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 23:22:56 +0000 Subject: [PATCH 5/8] debug --- augment_indexer/index_manager.py | 7 ++++++- augment_indexer/main.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/augment_indexer/index_manager.py b/augment_indexer/index_manager.py index c2bf48f..445893a 100644 --- a/augment_indexer/index_manager.py +++ b/augment_indexer/index_manager.py @@ -112,11 +112,16 @@ def _save_state(self, state: IndexState) -> None: state: The IndexState to save. 
""" # Ensure directory exists - Path(self._state_path).parent.mkdir(parents=True, exist_ok=True) + # Path(self._state_path).parent.mkdir(parents=True, exist_ok=True) + state_dir = Path(self._state_path).parent + state_dir.mkdir(parents=True, exist_ok=True) + print(f"Saving state to {self._state_path}") # Write state to file with open(self._state_path, "w", encoding="utf-8") as f: json.dump(state, f, indent=2) + + print(f"State saved successfully ({Path(self._state_path).stat().st_size} bytes)") def index(self) -> IndexResult: """ diff --git a/augment_indexer/main.py b/augment_indexer/main.py index fd10065..ce0046e 100644 --- a/augment_indexer/main.py +++ b/augment_indexer/main.py @@ -102,6 +102,7 @@ def get_state_path(branch: str) -> str: def main() -> None: """Main function.""" print("GitHub Action Indexer - Starting...") + print(f"Current working directory: {os.getcwd()}") try: # Load configuration @@ -111,7 +112,9 @@ def main() -> None: print(f"Repository: {config.owner}/{config.repo}") print(f"Branch: {config.branch}") print(f"Commit ref: {config.currentCommit}") - print(f"State path: {state_path}") + # print(f"State path: {state_path}") + print(f"State path (relative): {state_path}") + print(f"State path (absolute): {os.path.abspath(state_path)}") # Create DirectContext context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl) From 22149b7d8a487d270948d486735ad5457536ea56 Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Mon, 15 Dec 2025 23:29:44 +0000 Subject: [PATCH 6/8] include hidden files --- .github/workflows/index.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/index.yml b/.github/workflows/index.yml index 9d1811e..6a1fec0 100644 --- a/.github/workflows/index.yml +++ b/.github/workflows/index.yml @@ -77,4 +77,5 @@ jobs: name: index-state path: .augment-index-state/ retention-days: 30 + include-hidden-files: true From 3810f98a99ea7459102ffcf97e6b46499d92ead7 Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: 
Tue, 16 Dec 2025 00:50:11 +0000 Subject: [PATCH 7/8] remove the augment_indexer --- .github/workflows/index.yml | 81 ------- augment_indexer/__init__.py | 24 -- augment_indexer/file_filter.py | 123 ---------- augment_indexer/github_client.py | 307 ------------------------ augment_indexer/index_manager.py | 400 ------------------------------- augment_indexer/main.py | 170 ------------- augment_indexer/models.py | 131 ---------- augment_indexer/search.py | 132 ---------- 8 files changed, 1368 deletions(-) delete mode 100644 .github/workflows/index.yml delete mode 100644 augment_indexer/__init__.py delete mode 100644 augment_indexer/file_filter.py delete mode 100644 augment_indexer/github_client.py delete mode 100644 augment_indexer/index_manager.py delete mode 100644 augment_indexer/main.py delete mode 100644 augment_indexer/models.py delete mode 100644 augment_indexer/search.py diff --git a/.github/workflows/index.yml b/.github/workflows/index.yml deleted file mode 100644 index 6a1fec0..0000000 --- a/.github/workflows/index.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: Index Repository - -on: - push: - branches: - - main - - develop - - 'feature/**' # Index feature branches - - 'release/**' # Index release branches - workflow_dispatch: - inputs: - branch: - description: 'Branch to index (leave empty for current branch)' - required: false - type: string - force_full_reindex: - description: 'Force full re-index' - required: false - type: boolean - default: false - -jobs: - index: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Full history for comparison - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - cache: 'pip' - - - name: Install dependencies - run: pip install -r requirements.txt - - - name: Restore index state - uses: actions/cache@v4 - with: - path: .augment-index-state - # Use branch-specific cache key - key: augment-index-${{ github.ref_name }}-${{ 
github.sha }} - restore-keys: | - augment-index-${{ github.ref_name }}- - - - name: Index repository - id: index - run: python -m augment_indexer.main - env: - AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }} - AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - STORAGE_TYPE: file - # Branch-specific state path (automatically determined from GITHUB_REF) - # STATE_PATH is optional - defaults to .augment-index-state/{branch}/state.json - MAX_COMMITS: 100 - MAX_FILES: 500 - - - name: Print results - if: always() - run: | - echo "Success: ${{ steps.index.outputs.success }}" - echo "Type: ${{ steps.index.outputs.type }}" - echo "Files Indexed: ${{ steps.index.outputs.files_indexed }}" - echo "Files Deleted: ${{ steps.index.outputs.files_deleted }}" - echo "Checkpoint ID: ${{ steps.index.outputs.checkpoint_id }}" - echo "Commit SHA: ${{ steps.index.outputs.commit_sha }}" - - - name: Upload state artifact - if: success() - uses: actions/upload-artifact@v4 - with: - name: index-state - path: .augment-index-state/ - retention-days: 30 - include-hidden-files: true - diff --git a/augment_indexer/__init__.py b/augment_indexer/__init__.py deleted file mode 100644 index 499dfe6..0000000 --- a/augment_indexer/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -GitHub Action Repository Indexer - -A Python example showing how to index a GitHub repository using the Augment SDK -Direct Mode with incremental updates. - -See README.md for usage instructions. 
-""" - -from .models import FileChange, IndexConfig, IndexResult, IndexState -from .file_filter import should_filter_file -from .github_client import GitHubClient -from .index_manager import IndexManager - -__all__ = [ - "FileChange", - "IndexConfig", - "IndexResult", - "IndexState", - "should_filter_file", - "GitHubClient", - "IndexManager", -] - diff --git a/augment_indexer/file_filter.py b/augment_indexer/file_filter.py deleted file mode 100644 index 88ab035..0000000 --- a/augment_indexer/file_filter.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -File filtering logic for GitHub repository indexing. -""" - -import re -from pathlib import Path -from typing import Optional - -# Keyish pattern regex - matches files that likely contain secrets/keys -KEYISH_PATTERN = re.compile( - r'^(\.git|.*\.pem|.*\.key|.*\.pfx|.*\.p12|.*\.jks|.*\.keystore|.*\.pkcs12|.*\.crt|.*\.cer|id_rsa|id_ed25519|id_ecdsa|id_dsa)$' -) - -# Default max file size in bytes (1 MB) -DEFAULT_MAX_FILE_SIZE = 1024 * 1024 # 1 MB - - -def always_ignore_path(path: str) -> bool: - """ - Check if a path should always be ignored (security measure). - - Args: - path: The file path to check. - - Returns: - True if the path contains ".." and should be ignored. - """ - return ".." in path - - -def is_keyish_path(path: str) -> bool: - """ - Check if a path matches the keyish pattern (secrets/keys). - - Args: - path: The file path to check. - - Returns: - True if the filename matches patterns for secret/key files. - """ - # Extract filename from path - filename = Path(path).name - return bool(KEYISH_PATTERN.match(filename)) - - -def is_valid_file_size(size_bytes: int, max_file_size: int = DEFAULT_MAX_FILE_SIZE) -> bool: - """ - Check if file size is valid for upload. - - Args: - size_bytes: The size of the file in bytes. - max_file_size: Maximum allowed file size in bytes. Defaults to 1 MB. - - Returns: - True if the file size is within the allowed limit. 
- """ - return size_bytes <= max_file_size - - -def is_valid_utf8(content: bytes) -> bool: - """ - Check if file content is valid UTF-8 (not binary). - - Args: - content: The file content as bytes. - - Returns: - True if the content is valid UTF-8, False if it's binary or invalid. - """ - try: - content.decode("utf-8") - return True - except UnicodeDecodeError: - return False - - -def should_filter_file( - path: str, - content: bytes, - max_file_size: Optional[int] = None, -) -> dict: - """ - Check if a file should be filtered out. - - Returns {"filtered": True, "reason": "..."} if file should be skipped. - Returns {"filtered": False} if file should be included. - - Priority order (from file-filtering.md): - 1. Path validation (contains "..") - 2. File size check - 3. .augmentignore rules (checked by caller) - 4. Keyish patterns - 5. .gitignore rules (checked by caller) - 6. UTF-8 validation - - Args: - path: The file path to check. - content: The file content as bytes. - max_file_size: Maximum allowed file size in bytes. Defaults to DEFAULT_MAX_FILE_SIZE. - - Returns: - A dict with "filtered" (bool) and optionally "reason" (str) keys. - """ - effective_max_size = max_file_size if max_file_size is not None else DEFAULT_MAX_FILE_SIZE - - # 1. Check for ".." in path (security) - if always_ignore_path(path): - return {"filtered": True, "reason": "path_contains_dotdot"} - - # 2. Check file size - if not is_valid_file_size(len(content), effective_max_size): - return {"filtered": True, "reason": f"file_too_large ({len(content)} bytes)"} - - # 3. Check keyish patterns (secrets/keys) - if is_keyish_path(path): - return {"filtered": True, "reason": "keyish_pattern"} - - # 4. 
Check UTF-8 validity (binary detection) - if not is_valid_utf8(content): - return {"filtered": True, "reason": "binary_file"} - - return {"filtered": False} - diff --git a/augment_indexer/github_client.py b/augment_indexer/github_client.py deleted file mode 100644 index f69bd62..0000000 --- a/augment_indexer/github_client.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -GitHub API client for fetching repository data. -""" - -import io -import tarfile - -import pathspec -import requests -from github import Github -from github.GithubException import GithubException - -from .file_filter import should_filter_file -from .models import FileChange - - -class GitHubClient: - """GitHub API client for fetching repository data.""" - - def __init__(self, token: str) -> None: - """ - Initialize the GitHub client with an authentication token. - - Args: - token: GitHub personal access token or GitHub App token. - """ - self._github = Github(token) - self._token = token - - def resolve_ref(self, owner: str, repo: str, ref: str) -> str: - """ - Resolve a ref (like "HEAD", "main", or a commit SHA) to a commit SHA. - - Args: - owner: Repository owner. - repo: Repository name. - ref: Git ref to resolve. - - Returns: - The full 40-character commit SHA. - - Raises: - Exception: If the ref cannot be resolved. - """ - try: - repository = self._github.get_repo(f"{owner}/{repo}") - commit = repository.get_commit(ref) - return commit.sha - except GithubException as error: - raise Exception( - f'Failed to resolve ref "{ref}" for {owner}/{repo}: {error}' - ) from error - - def download_tarball(self, owner: str, repo: str, ref: str) -> dict[str, str]: - """ - Download repository as tarball and extract files. - - Args: - owner: Repository owner. - repo: Repository name. - ref: Git ref to download. - - Returns: - Dictionary mapping file paths to their contents. 
- """ - print(f"Downloading tarball for {owner}/{repo}@{ref}...") - - repository = self._github.get_repo(f"{owner}/{repo}") - tarball_url = repository.get_archive_link("tarball", ref) - - # Download tarball (10 minute timeout to handle large repositories) - # Include auth header for private repos - headers = {"Authorization": f"Bearer {self._token}"} - response = requests.get(tarball_url, headers=headers, stream=True, timeout=600) - if not response.ok: - raise Exception(f"Failed to download tarball: {response.reason}") - - # Load ignore patterns - augmentignore, gitignore = self._load_ignore_patterns(owner, repo, ref) - - # Track filtering statistics - files: dict[str, str] = {} - total_files = 0 - filtered_files = 0 - filter_reasons: dict[str, int] = {} - - # Extract files from tarball - tarball_data = io.BytesIO(response.content) - with tarfile.open(fileobj=tarball_data, mode="r:gz") as tar: - for member in tar.getmembers(): - # Skip directories and symlinks - if not member.isfile(): - continue - - total_files += 1 - - # Remove the root directory prefix (e.g., "owner-repo-sha/") - path_parts = member.name.split("/") - path_parts.pop(0) # Remove first component - file_path = "/".join(path_parts) - - if not file_path: - continue - - # Read file contents - file_obj = tar.extractfile(member) - if file_obj is None: - continue - content_bytes = file_obj.read() - - # Apply filtering in priority order: - # 1. .augmentignore - if augmentignore and augmentignore.match_file(file_path): - filtered_files += 1 - filter_reasons["augmentignore"] = filter_reasons.get("augmentignore", 0) + 1 - continue - - # 2. Path validation, file size, keyish patterns, UTF-8 validation - filter_result = should_filter_file(path=file_path, content=content_bytes) - - if filter_result["filtered"]: - filtered_files += 1 - reason = filter_result.get("reason", "unknown") - filter_reasons[reason] = filter_reasons.get(reason, 0) + 1 - continue - - # 3. 
.gitignore (checked last) - if gitignore and gitignore.match_file(file_path): - filtered_files += 1 - filter_reasons["gitignore"] = filter_reasons.get("gitignore", 0) + 1 - continue - - # File passed all filters - try: - contents = content_bytes.decode("utf-8") - files[file_path] = contents - except UnicodeDecodeError: - # This should not happen if is_valid_utf8() is working correctly - filtered_files += 1 - filter_reasons["decode_error"] = filter_reasons.get("decode_error", 0) + 1 - print(f"Warning: File {file_path} passed UTF-8 validation but failed to decode") - - print(f"Extracted {len(files)} files from tarball") - print(f"Filtered {filtered_files} of {total_files} files. Reasons: {filter_reasons}") - return files - - def compare_commits( - self, owner: str, repo: str, base: str, head: str - ) -> dict: - """ - Compare two commits and get file changes. - """ - print(f"Comparing {base}...{head}...") - - repository = self._github.get_repo(f"{owner}/{repo}") - comparison = repository.compare(base, head) - - files: list[FileChange] = [] - - for file in comparison.files: - change = FileChange( - path=file.filename, - status=self._map_github_status(file.status), - previousFilename=file.previous_filename, - ) - - # Download file contents for added/modified files - if change.status in ("added", "modified"): - try: - contents = self.get_file_contents(owner, repo, file.filename, head) - change.contents = contents - except Exception as error: - print(f"Warning: Failed to download {file.filename}: {error}") - - files.append(change) - - return { - "files": files, - "commits": comparison.total_commits, - "totalChanges": len(comparison.files), - } - - def get_file_contents( - self, owner: str, repo: str, path: str, ref: str - ) -> str: - """ - Get file contents at a specific ref. - - Args: - owner: Repository owner. - repo: Repository name. - path: File path within the repository. - ref: Git ref to get contents at. - - Returns: - The file contents as a string. 
- - Raises: - Exception: If the path is not a file. - """ - repository = self._github.get_repo(f"{owner}/{repo}") - content = repository.get_contents(path, ref) - - if isinstance(content, list): - raise Exception(f"{path} is not a file") - - return content.decoded_content.decode("utf-8") - - def _load_ignore_patterns( - self, owner: str, repo: str, ref: str - ) -> tuple[pathspec.PathSpec | None, pathspec.PathSpec | None]: - """ - Load .gitignore and .augmentignore patterns separately. - - Returns both filters to maintain proper priority order: - .augmentignore → keyish → .gitignore - - Args: - owner: Repository owner. - repo: Repository name. - ref: Git ref to load patterns from. - - Returns: - Tuple of (augmentignore, gitignore) PathSpec objects, or None if not found. - """ - augmentignore: pathspec.PathSpec | None = None - gitignore: pathspec.PathSpec | None = None - - # Try to load .gitignore - try: - gitignore_content = self.get_file_contents(owner, repo, ".gitignore", ref) - gitignore = pathspec.PathSpec.from_lines("gitwildmatch", gitignore_content.splitlines()) - except Exception: - # .gitignore doesn't exist - pass - - # Try to load .augmentignore - try: - augmentignore_content = self.get_file_contents(owner, repo, ".augmentignore", ref) - augmentignore = pathspec.PathSpec.from_lines("gitwildmatch", augmentignore_content.splitlines()) - except Exception: - # .augmentignore doesn't exist - pass - - return augmentignore, gitignore - - def _map_github_status(self, status: str) -> str: - """ - Map GitHub file status to our FileChange status. - - Args: - status: GitHub file status string. - - Returns: - Normalized status string. - """ - status_map = { - "added": "added", - "modified": "modified", - "removed": "removed", - "renamed": "renamed", - } - return status_map.get(status, "modified") - - def ignore_files_changed( - self, owner: str, repo: str, base: str, head: str - ) -> bool: - """ - Check if ignore files changed between commits. 
- - Args: - owner: Repository owner. - repo: Repository name. - base: Base commit SHA. - head: Head commit SHA. - - Returns: - True if .gitignore or .augmentignore changed, False otherwise. - """ - repository = self._github.get_repo(f"{owner}/{repo}") - comparison = repository.compare(base, head) - - ignore_files = [".gitignore", ".augmentignore"] - return any(file.filename in ignore_files for file in comparison.files) - - def is_force_push( - self, owner: str, repo: str, base: str, head: str - ) -> bool: - """ - Check if the push was a force push. - - Args: - owner: Repository owner. - repo: Repository name. - base: Base commit SHA. - head: Head commit SHA. - - Returns: - True if the push was a force push, False otherwise. - """ - try: - repository = self._github.get_repo(f"{owner}/{repo}") - repository.compare(base, head) - return False - except GithubException: - # If comparison fails, it's likely a force push - return True diff --git a/augment_indexer/index_manager.py b/augment_indexer/index_manager.py deleted file mode 100644 index 445893a..0000000 --- a/augment_indexer/index_manager.py +++ /dev/null @@ -1,400 +0,0 @@ -""" -Index Manager - Core indexing logic -""" - -import json -import tempfile -from pathlib import Path -from typing import Optional - -from auggie_sdk.context import DirectContext, File - -from .github_client import GitHubClient -from .models import FileChange, IndexConfig, IndexResult, IndexState, RepositoryInfo - -DEFAULT_MAX_COMMITS = 100 -DEFAULT_MAX_FILES = 500 - - -class IndexManager: - """Index Manager - Core indexing logic for GitHub repositories.""" - - def __init__( - self, context: DirectContext, config: IndexConfig, state_path: str - ) -> None: - """ - Initialize the IndexManager. - - Args: - context: DirectContext instance for indexing operations. - config: Configuration for the indexing operation. - state_path: Path to the state file for persistence. 
- """ - self._context = context - self._config = config - self._state_path = state_path - self._github = GitHubClient(config.githubToken) - - def resolve_commit_sha(self) -> None: - """ - Resolve the current commit ref to an actual commit SHA. - - This handles cases where GITHUB_SHA might be "HEAD" or a branch name. - Updates the config.currentCommit with the resolved SHA. - """ - resolved_sha = self._github.resolve_ref( - self._config.owner, self._config.repo, self._config.currentCommit - ) - self._config.currentCommit = resolved_sha - - def _load_state(self) -> Optional[IndexState]: - """ - Load index state from file system. - - EXTENDING TO OTHER STORAGE BACKENDS: - Replace this method to load state from your preferred storage: - - Redis: Use redis-py client to GET the state JSON - - S3: Use boto3 to get_object from S3 bucket - - Database: Query your database for the state record - - Example for Redis: - import redis - r = redis.Redis.from_url(redis_url) - data = r.get(state_key) - return json.loads(data) if data else None - - Example for S3: - import boto3 - s3 = boto3.client('s3') - response = s3.get_object(Bucket=bucket, Key=key) - data = response['Body'].read().decode('utf-8') - return json.loads(data) - - Returns: - The loaded IndexState or None if the file doesn't exist. - """ - try: - with open(self._state_path, "r", encoding="utf-8") as f: - return json.load(f) - except FileNotFoundError: - return None - - def _save_state(self, state: IndexState) -> None: - """ - Save index state to file system. 
- - EXTENDING TO OTHER STORAGE BACKENDS: - Replace this method to save state to your preferred storage: - - Redis: Use redis-py client to SET the state JSON - - S3: Use boto3 to put_object to S3 bucket - - Database: Insert or update the state record in your database - - Example for Redis: - import redis - r = redis.Redis.from_url(redis_url) - r.set(state_key, json.dumps(state)) - - Example for S3: - import boto3 - s3 = boto3.client('s3') - s3.put_object( - Bucket=bucket, - Key=key, - Body=json.dumps(state), - ContentType='application/json' - ) - - Note: The state is just a JSON object (IndexState type) that can be - serialized and stored anywhere. For distributed systems, consider using - Redis or a database for shared state across multiple workers. - - Args: - state: The IndexState to save. - """ - # Ensure directory exists - # Path(self._state_path).parent.mkdir(parents=True, exist_ok=True) - state_dir = Path(self._state_path).parent - state_dir.mkdir(parents=True, exist_ok=True) - print(f"Saving state to {self._state_path}") - - # Write state to file - with open(self._state_path, "w", encoding="utf-8") as f: - json.dump(state, f, indent=2) - - print(f"State saved successfully ({Path(self._state_path).stat().st_size} bytes)") - - def index(self) -> IndexResult: - """ - Main indexing entry point. - - Returns: - IndexResult with success status and indexing details. 
- """ - print( - f"Starting index for {self._config.owner}/{self._config.repo}" - f"@{self._config.branch}" - ) - - try: - # Load previous state - previous_state = self._load_state() - - # If we have previous state, we'll need to create a new context with the imported state - # For now, we'll handle this in the incremental update logic - - # Determine if we need full re-index - should_reindex, reason = self._should_full_reindex(previous_state) - - if should_reindex: - return self._full_reindex(reason) - - # Perform incremental update - # previous_state is guaranteed to be non-null here - if not previous_state: - raise RuntimeError("previous_state should not be None at this point") - return self._incremental_update(previous_state) - except Exception as error: - print(f"Indexing failed: {error}") - return IndexResult( - success=False, - type="full", - filesIndexed=0, - filesDeleted=0, - checkpointId="", - commitSha=self._config.currentCommit, - error=str(error), - ) - - def _should_full_reindex( - self, previous_state: Optional[IndexState] - ) -> tuple[bool, Optional[str]]: - """ - Determine if full re-index is needed. - - Args: - previous_state: The previous index state, or None if first run. - - Returns: - Tuple of (should_reindex, reason). 
- """ - # No previous state - first run - if not previous_state: - return (True, "first_run") - - # Different repository - if ( - previous_state["repository"]["owner"] != self._config.owner - or previous_state["repository"]["name"] != self._config.repo - ): - return (True, "different_repository") - - # Same commit - no changes - if previous_state["lastCommitSha"] == self._config.currentCommit: - print("No changes detected") - return (False, None) - - # Check for force push - is_force_push = self._github.is_force_push( - self._config.owner, - self._config.repo, - previous_state["lastCommitSha"], - self._config.currentCommit, - ) - - if is_force_push: - return (True, "force_push") - - # Get comparison - comparison = self._github.compare_commits( - self._config.owner, - self._config.repo, - previous_state["lastCommitSha"], - self._config.currentCommit, - ) - - # Too many commits - max_commits = self._config.maxCommits or DEFAULT_MAX_COMMITS - if comparison["commits"] > max_commits: - return ( - True, - f"too_many_commits ({comparison['commits']} > {max_commits})", - ) - - # Too many file changes - max_files = self._config.maxFiles or DEFAULT_MAX_FILES - if comparison["totalChanges"] > max_files: - return ( - True, - f"too_many_files ({comparison['totalChanges']} > {max_files})", - ) - - # Check if ignore files changed - ignore_changed = self._github.ignore_files_changed( - self._config.owner, - self._config.repo, - previous_state["lastCommitSha"], - self._config.currentCommit, - ) - - if ignore_changed: - return (True, "ignore_files_changed") - - return (False, None) - - def _full_reindex(self, reason: Optional[str]) -> IndexResult: - """ - Perform full repository re-index. - - Args: - reason: The reason for the full re-index. - - Returns: - IndexResult with the result of the full re-index. 
- """ - print(f"Performing full re-index (reason: {reason or 'unknown'})") - - # Download entire repository as tarball - files = self._github.download_tarball( - self._config.owner, self._config.repo, self._config.currentCommit - ) - - # Add all files to index - files_to_index = [ - File(path=path, contents=contents) for path, contents in files.items() - ] - - print(f"Adding {len(files_to_index)} files to index...") - self._context.add_to_index(files_to_index) - - # Export DirectContext state - context_state = self._context.export() - context_state_dict = context_state.to_dict() - - new_state: IndexState = { - "contextState": context_state_dict, - "lastCommitSha": self._config.currentCommit, - "repository": RepositoryInfo( - owner=self._config.owner, - name=self._config.repo, - ), - } - - # Save state - self._save_state(new_state) - - return IndexResult( - success=True, - type="full", - filesIndexed=len(files_to_index), - filesDeleted=0, - checkpointId=context_state.checkpoint_id or "", - commitSha=self._config.currentCommit, - reindexReason=reason, - ) - - def _incremental_update(self, previous_state: IndexState) -> IndexResult: - """ - Perform incremental update. - - Args: - previous_state: The previous index state. - - Returns: - IndexResult with the result of the incremental update. 
- """ - print("Performing incremental update...") - - # Create a temporary file with the previous context state - # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open - temp_file = tempfile.NamedTemporaryFile( - mode="w", suffix=".json", prefix="github-indexer-incremental-", delete=False - ) - temp_path = Path(temp_file.name) - try: - json.dump(previous_state["contextState"], temp_file, indent=2) - temp_file.close() # Close before reading on Windows - - # Create a new context from the previous state - self._context = DirectContext.import_from_file( - str(temp_path), - api_key=self._config.apiToken, - api_url=self._config.apiUrl, - ) - finally: - temp_path.unlink(missing_ok=True) - - # Get file changes - comparison = self._github.compare_commits( - self._config.owner, - self._config.repo, - previous_state["lastCommitSha"], - self._config.currentCommit, - ) - - # Process changes - files_to_add, files_to_delete = self._process_file_changes(comparison["files"]) - - print(f"Adding {len(files_to_add)} files, deleting {len(files_to_delete)} files") - - # Update index - if files_to_add: - self._context.add_to_index(files_to_add) - - if files_to_delete: - self._context.remove_from_index(files_to_delete) - - # Export DirectContext state - context_state = self._context.export() - context_state_dict = context_state.to_dict() - - new_state: IndexState = { - "contextState": context_state_dict, - "lastCommitSha": self._config.currentCommit, - "repository": previous_state["repository"], - } - - # Save state - self._save_state(new_state) - - return IndexResult( - success=True, - type="incremental", - filesIndexed=len(files_to_add), - filesDeleted=len(files_to_delete), - checkpointId=context_state.checkpoint_id or "", - commitSha=self._config.currentCommit, - ) - - def _process_file_changes( - self, changes: list[FileChange] - ) -> tuple[list[File], list[str]]: - """ - Process file changes and categorize them for indexing. 
- - Args: - changes: List of file changes from the comparison. - - Returns: - Tuple of (files_to_add, files_to_delete). - """ - files_to_add: list[File] = [] - files_to_delete: list[str] = [] - - for change in changes: - if change.status in ("added", "modified"): - if change.contents: - files_to_add.append( - File(path=change.path, contents=change.contents) - ) - elif change.status == "removed": - files_to_delete.append(change.path) - elif change.status == "renamed": - if change.previousFilename: - files_to_delete.append(change.previousFilename) - if change.contents: - files_to_add.append( - File(path=change.path, contents=change.contents) - ) - - return files_to_add, files_to_delete - diff --git a/augment_indexer/main.py b/augment_indexer/main.py deleted file mode 100644 index ce0046e..0000000 --- a/augment_indexer/main.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 -""" -Main entry point for GitHub Action Indexer - -Usage: - cd examples/python-sdk/context - python -m github_action_indexer index -""" - -import os -import re -import sys - -from auggie_sdk.context import DirectContext - -from .index_manager import IndexManager -from .models import IndexConfig - - -def get_api_credentials() -> tuple[str, str]: - """Get API credentials from environment variables.""" - api_token = os.environ.get("AUGMENT_API_TOKEN") - if not api_token: - raise ValueError("AUGMENT_API_TOKEN environment variable is required") - - api_url = os.environ.get("AUGMENT_API_URL") - if not api_url: - raise ValueError( - "AUGMENT_API_URL environment variable is required. Please set it to your " - "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')" - ) - - return api_token, api_url - - -def parse_repository_info() -> tuple[str, str, str, str]: - """ - Parse repository information from environment variables. - Returns (owner, repo, branch, current_commit). 
- """ - repository = os.environ.get("GITHUB_REPOSITORY", "") - parts = repository.split("/") - - if len(parts) != 2 or not parts[0] or not parts[1]: - raise ValueError('GITHUB_REPOSITORY must be in format "owner/repo"') - - owner, repo = parts - - # Extract branch name from GitHub ref - github_ref = os.environ.get("GITHUB_REF", "") - github_ref_name = os.environ.get("GITHUB_REF_NAME", "") - - if github_ref.startswith("refs/heads/"): - branch = github_ref_name - elif github_ref.startswith("refs/tags/"): - branch = f"tag/{github_ref_name}" - elif github_ref_name: - branch = github_ref_name - else: - branch = os.environ.get("BRANCH", "main") - - current_commit = os.environ.get("GITHUB_SHA", "") - if not current_commit: - raise ValueError("GITHUB_SHA environment variable is required") - - return owner, repo, branch, current_commit - - -def load_config() -> IndexConfig: - """Load configuration from environment variables.""" - github_token = os.environ.get("GITHUB_TOKEN") - if not github_token: - raise ValueError("GITHUB_TOKEN environment variable is required") - - api_token, api_url = get_api_credentials() - owner, repo, branch, current_commit = parse_repository_info() - - max_commits = os.environ.get("MAX_COMMITS") - max_files = os.environ.get("MAX_FILES") - - return IndexConfig( - apiToken=api_token, - apiUrl=api_url, - githubToken=github_token, - owner=owner, - repo=repo, - branch=branch, - currentCommit=current_commit, - maxCommits=int(max_commits) if max_commits else None, - maxFiles=int(max_files) if max_files else None, - ) - - -def get_state_path(branch: str) -> str: - """Get the state file path for the current branch.""" - sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) - return os.environ.get( - "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" - ) - - -def main() -> None: - """Main function.""" - print("GitHub Action Indexer - Starting...") - print(f"Current working directory: {os.getcwd()}") - - try: - # Load configuration - 
config = load_config() - state_path = get_state_path(config.branch) - - print(f"Repository: {config.owner}/{config.repo}") - print(f"Branch: {config.branch}") - print(f"Commit ref: {config.currentCommit}") - # print(f"State path: {state_path}") - print(f"State path (relative): {state_path}") - print(f"State path (absolute): {os.path.abspath(state_path)}") - - # Create DirectContext - context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl) - - # Create index manager and resolve commit SHA - manager = IndexManager(context, config, state_path) - manager.resolve_commit_sha() - - print(f"Resolved commit SHA: {config.currentCommit}") - - # Perform indexing - result = manager.index() - - # Print results - print("\n=== Indexing Results ===") - print(f"Success: {result.success}") - print(f"Type: {result.type}") - print(f"Files Indexed: {result.filesIndexed}") - print(f"Files Deleted: {result.filesDeleted}") - print(f"Checkpoint ID: {result.checkpointId}") - print(f"Commit SHA: {result.commitSha}") - - if result.reindexReason: - print(f"Re-index Reason: {result.reindexReason}") - - if result.error: - print(f"Error: {result.error}", file=sys.stderr) - sys.exit(1) - - # Set GitHub Actions output - github_output = os.environ.get("GITHUB_OUTPUT") - if github_output: - output_lines = [ - f"success={result.success}", - f"type={result.type}", - f"files_indexed={result.filesIndexed}", - f"files_deleted={result.filesDeleted}", - f"checkpoint_id={result.checkpointId}", - f"commit_sha={result.commitSha}", - ] - with open(github_output, "a") as f: - f.write("\n".join(output_lines) + "\n") - - print("\nIndexing completed successfully!") - - except Exception as error: - print(f"Fatal error: {error}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() - diff --git a/augment_indexer/models.py b/augment_indexer/models.py deleted file mode 100644 index 8b3dfc0..0000000 --- a/augment_indexer/models.py +++ /dev/null @@ -1,131 +0,0 @@ -""" -Types for 
the GitHub Action Indexer - -This module defines the data types used by the GitHub Action Indexer -for tracking index state, file changes, configuration, and results. -""" - -from dataclasses import dataclass -from typing import Literal, Optional - -from typing_extensions import TypedDict - -from auggie_sdk.context.models import DirectContextState - - -class RepositoryInfo(TypedDict): - """Repository information for index state.""" - - owner: str # Repository owner - name: str # Repository name - - -class IndexState(TypedDict): - """ - Persistent state for the GitHub Action Indexer. - - This state is stored between indexing runs to enable incremental indexing. - """ - - contextState: DirectContextState - """DirectContext state (checkpoint, blobs, etc.)""" - - lastCommitSha: str - """Last indexed commit SHA (must be a full 40-character SHA, not a ref like 'HEAD')""" - - repository: RepositoryInfo - """Repository information - used to verify we're indexing the same repository""" - - -@dataclass -class FileChange: - """ - Represents a file change detected between commits. - - Used to track what files need to be indexed or removed from the index. - """ - - path: str - """File path""" - - status: Literal["added", "modified", "removed", "renamed"] - """Change status: added, modified, removed, renamed""" - - previousFilename: Optional[str] = None - """Previous filename (for renames)""" - - contents: Optional[str] = None - """File contents (for added/modified files)""" - - oldBlobName: Optional[str] = None - """Blob name from previous index (for modified/removed files)""" - - -@dataclass -class IndexConfig: - """ - Configuration for the GitHub Action Indexer. - - Contains all the settings needed to perform indexing of a GitHub repository. 
- """ - - apiToken: str - """Augment API token""" - - apiUrl: str - """Augment API URL (provided via AUGMENT_API_URL env var)""" - - githubToken: str - """GitHub token""" - - owner: str - """Repository owner""" - - repo: str - """Repository name""" - - branch: str - """Branch to index""" - - currentCommit: str - """Current commit SHA""" - - maxCommits: Optional[int] = None - """Maximum commits before full re-index""" - - maxFiles: Optional[int] = None - """Maximum file changes before full re-index""" - - -@dataclass -class IndexResult: - """ - Result from an indexing operation. - - Contains information about what was indexed and whether it was successful. - """ - - success: bool - """Whether indexing was successful""" - - type: Literal["full", "incremental", "no-changes"] - """Type of indexing performed""" - - filesIndexed: int - """Number of files indexed""" - - filesDeleted: int - """Number of files deleted""" - - checkpointId: str - """New checkpoint ID""" - - commitSha: str - """Commit SHA that was indexed""" - - error: Optional[str] = None - """Error message if failed""" - - reindexReason: Optional[str] = None - """Reason for full re-index (if applicable)""" - diff --git a/augment_indexer/search.py b/augment_indexer/search.py deleted file mode 100644 index fdac426..0000000 --- a/augment_indexer/search.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 -""" -CLI tool to search the indexed repository - -Usage: - cd examples/python-sdk/context - python -m github_action_indexer search "your search query" - python -m github_action_indexer search "your search query" --max-chars 5000 -""" - -import argparse -import json -import os -import re -import sys -import tempfile -from pathlib import Path -from typing import Optional - -from auggie_sdk.context import DirectContext - -from .models import IndexState - - -def get_state_path() -> str: - """Get the state file path for the current branch.""" - branch = os.environ.get("BRANCH", "main") - sanitized_branch = 
re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) - return os.environ.get( - "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" - ) - - -def load_state(state_path: str) -> Optional[IndexState]: - """Load index state from file system.""" - try: - with open(state_path, "r") as f: - data = f.read() - return json.loads(data) - except FileNotFoundError: - return None - - -def main() -> None: - """Main search function.""" - # Parse command line arguments - parser = argparse.ArgumentParser( - description="Search the indexed repository", - epilog='Example: python search.py "authentication functions"', - ) - parser.add_argument("query", help="Search query") - parser.add_argument( - "--max-chars", - type=int, - help="Maximum number of characters in output", - dest="max_chars", - ) - args = parser.parse_args() - - # Get API credentials - api_token = os.environ.get("AUGMENT_API_TOKEN") - if not api_token: - print("Error: AUGMENT_API_TOKEN environment variable is required", file=sys.stderr) - sys.exit(1) - - api_url = os.environ.get("AUGMENT_API_URL") - if not api_url: - print( - "Error: AUGMENT_API_URL environment variable is required. Please set it to your " - "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')", - file=sys.stderr, - ) - sys.exit(1) - - print(f'Searching for: "{args.query}"') - if args.max_chars is not None: - print(f"Limiting results to max {args.max_chars} characters\n") - else: - print() - - try: - # Load the index state first - state_path = get_state_path() - print(f"Loading index state from: {state_path}") - state = load_state(state_path) - - if not state: - print("Error: No index state found. 
Run indexing first.", file=sys.stderr) - print(" python -m github_action_indexer index", file=sys.stderr) - sys.exit(1) - - # Create a temporary file with the context state for import - # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open - temp_file = tempfile.NamedTemporaryFile( - mode="w", suffix=".json", prefix="github-indexer-state-", delete=False - ) - temp_path = Path(temp_file.name) - try: - json.dump(state["contextState"], temp_file, indent=2) - temp_file.close() # Close before reading on Windows - - # Import state using DirectContext.import_from_file - context = DirectContext.import_from_file( - str(temp_path), api_key=api_token, api_url=api_url - ) - finally: - temp_path.unlink(missing_ok=True) - - file_count = len(state["contextState"].get("blobs", [])) - - print(f"Loaded index: {file_count} files indexed") - print(f"Repository: {state['repository']['owner']}/{state['repository']['name']}") - print(f"Last indexed commit: {state['lastCommitSha']}\n") - - # Perform search with optional character limit - results = context.search(args.query, max_output_length=args.max_chars) - - if not results or results.strip() == "": - print("No results found.") - return - - print("Search results:\n") - print(results) - - except Exception as error: - print(f"Search failed: {error}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() - From 3d5728008197dac308c6c8d2d8c9f95255815222 Mon Sep 17 00:00:00 2001 From: Rich Hankins Date: Tue, 16 Dec 2025 00:52:37 +0000 Subject: [PATCH 8/8] new indexer --- .github/workflows/augment-index.yml | 81 ++++ .gitignore | 3 + augment_indexer/__init__.py | 24 ++ .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 794 bytes .../__pycache__/file_filter.cpython-311.pyc | Bin 0 -> 4189 bytes .../__pycache__/github_client.cpython-311.pyc | Bin 0 -> 13124 bytes .../__pycache__/index_manager.cpython-311.pyc | Bin 0 -> 15403 bytes .../__pycache__/main.cpython-311.pyc | Bin 0 -> 7988 bytes 
.../__pycache__/models.cpython-311.pyc | Bin 0 -> 3886 bytes augment_indexer/file_filter.py | 123 ++++++ augment_indexer/github_client.py | 307 ++++++++++++++ augment_indexer/index_manager.py | 395 ++++++++++++++++++ augment_indexer/main.py | 167 ++++++++ augment_indexer/models.py | 131 ++++++ augment_indexer/requirements.txt | 14 + augment_indexer/search.py | 132 ++++++ 16 files changed, 1377 insertions(+) create mode 100644 .github/workflows/augment-index.yml create mode 100644 .gitignore create mode 100644 augment_indexer/__init__.py create mode 100644 augment_indexer/__pycache__/__init__.cpython-311.pyc create mode 100644 augment_indexer/__pycache__/file_filter.cpython-311.pyc create mode 100644 augment_indexer/__pycache__/github_client.cpython-311.pyc create mode 100644 augment_indexer/__pycache__/index_manager.cpython-311.pyc create mode 100644 augment_indexer/__pycache__/main.cpython-311.pyc create mode 100644 augment_indexer/__pycache__/models.cpython-311.pyc create mode 100644 augment_indexer/file_filter.py create mode 100644 augment_indexer/github_client.py create mode 100644 augment_indexer/index_manager.py create mode 100644 augment_indexer/main.py create mode 100644 augment_indexer/models.py create mode 100644 augment_indexer/requirements.txt create mode 100644 augment_indexer/search.py diff --git a/.github/workflows/augment-index.yml b/.github/workflows/augment-index.yml new file mode 100644 index 0000000..bdb544c --- /dev/null +++ b/.github/workflows/augment-index.yml @@ -0,0 +1,81 @@ +name: Index Repository + +on: + push: + branches: + - main + - develop + - 'feature/**' # Index feature branches + - 'release/**' # Index release branches + workflow_dispatch: + inputs: + branch: + description: 'Branch to index (leave empty for current branch)' + required: false + type: string + force_full_reindex: + description: 'Force full re-index' + required: false + type: boolean + default: false + +jobs: + index: + runs-on: ubuntu-latest + + steps: + - name: 
Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for comparison + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: pip install -r augment_indexer/requirements.txt + + - name: Restore index state + uses: actions/cache@v4 + with: + path: .augment-index-state + # Use branch-specific cache key + key: augment-index-${{ github.ref_name }}-${{ github.sha }} + restore-keys: | + augment-index-${{ github.ref_name }}- + + - name: Index repository + id: index + run: python -m augment_indexer.main + env: + AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }} + AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + STORAGE_TYPE: file + # Branch-specific state path (automatically determined from GITHUB_REF) + # STATE_PATH is optional - defaults to .augment-index-state/{branch}/state.json + MAX_COMMITS: 100 + MAX_FILES: 500 + + - name: Print results + if: always() + run: | + echo "Success: ${{ steps.index.outputs.success }}" + echo "Type: ${{ steps.index.outputs.type }}" + echo "Files Indexed: ${{ steps.index.outputs.files_indexed }}" + echo "Files Deleted: ${{ steps.index.outputs.files_deleted }}" + echo "Checkpoint ID: ${{ steps.index.outputs.checkpoint_id }}" + echo "Commit SHA: ${{ steps.index.outputs.commit_sha }}" + + - name: Upload state artifact + if: success() + uses: actions/upload-artifact@v4 + with: + name: index-state + path: .augment-index-state/ + retention-days: 30 + include-hidden-files: true + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bbb849d --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ + +# Augment indexer files +.augment-index-state/ diff --git a/augment_indexer/__init__.py b/augment_indexer/__init__.py new file mode 100644 index 0000000..499dfe6 --- /dev/null +++ b/augment_indexer/__init__.py @@ -0,0 +1,24 @@ +""" +GitHub Action Repository Indexer + +A Python 
example showing how to index a GitHub repository using the Augment SDK +Direct Mode with incremental updates. + +See README.md for usage instructions. +""" + +from .models import FileChange, IndexConfig, IndexResult, IndexState +from .file_filter import should_filter_file +from .github_client import GitHubClient +from .index_manager import IndexManager + +__all__ = [ + "FileChange", + "IndexConfig", + "IndexResult", + "IndexState", + "should_filter_file", + "GitHubClient", + "IndexManager", +] + diff --git a/augment_indexer/__pycache__/__init__.cpython-311.pyc b/augment_indexer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69d422f0967250d2148ebf671fb079d7a6d1230d GIT binary patch literal 794 zcmaJ-J&)5c7`D^2Y18Y#@drGCDnz?7AVhU7A3{B$cFSVrlDCN^JES+E7H%SxD1Z`j)+g)7iuIAQNMCdVXP#cC~qRe58Wy?$sY*J?m3d5Guci~iMNDP~oDX>dnawJGJ*mNvey-s5T)hG;?0BkXv zOy{$aO4+K^tghk=u(~q3cI{%>FN$Ls`c3#sNI1*mB7>&yT0bj`mBW;D*xB+vu@4zWo`dCWNzmr=OBzMoo zd8uF=@5el@bYksHz*8OP$1rz9pvN37VK^VHH+S9V*kCQ;g?@-ccAC{kgk6hLy7L0+ q?}&v{LT7YKrvE=TOLF|xL{%d(l~%PAyRKq8QS_;eSgsLKZL9_%R-B=<){;xi zE-i_sQMJgyEdo@B6jsm#)yYYbORojmqCgM56;z<2!WIDn^wb**H-M2t(f4M#q^Jso zi=w0DyxE!e-n@N(ebmzvBQSox^{R2NhmgPGi$A)Wi^so(#Y4i#B4G*(n98C8FlYvf z!Ns70d#b4xLyIAW1PE8|gnb4q^hhD(6Zm;!F~Y)&Q5NA{EXrdAm36(NF2)bM$Kbu2 zM+*vz^B&d>Kg0{Oo_9iv36|i=CS4NnQeA{SBWZEJn}>MM3I8Gjc+V7~ECqCF7MLfQ zboF9v)-X9O7^cI8QCg;^wQS^R!4mXahI6g5Lh0KXvo z9;11|c1n%hC7_2u_4hFfV`_!xSE*5;Iz=ATUa=}Bqq@1KZ`gE+Q)9UVQZhPX^L#}Z z&IT=V-L43p(PEOrq*%5mPsgHpyrxO;hLrLEO;%Lko&t4%N(zy; zN{()nY&xiEgH$guU#zw~n!K?Z(6nEWj3PqtbvAxC{qp7U6|2a{^~!RQmz;51)N zI$aeBU3JN_OgHiR%a&%8&#fL+Hkn-_7wGX^dX+Z%{*S~k5>4h?>rwoW5=pD@BNT(^`W_KOloo zN~d{2ub2*ajz!PWx!0a1ln?N+>6+oJfFu~={ynBqG#pPV5e4;t0gnL?y)bpyE)BMm zx@ap|^aGNHh)I|rDEmow|MrYSGYssc92DxHQIhKRY*P(YoWj_p z>})3Bh8fRWjJxTDl7Wk9`8p$nB_IhO2Q_+0*o(QImfe8EC2@e)8A@=gDXBGYz7I`r6ZeviCvdL8TFd-6v0;*xkV7hkYl= z`?2BRBV{<4N3;5)m4*KaI}mn(9|UTFQrPj&0p1Tf2Ph8bW)Wfm7G&xNVDygM9BH#S 
zoP}$_N8lWvv>e3l2Oc5kKT!OPu<(ceEq<2;7aS=~q$v%ipXN9$fsIl}Jv`RMVuk`~ zCa(VlMC$ZOy&2?Qv4^S(u?kfHg5Rn@NJ`{pnWpy!{lh_ULnzgwbWlJGV3pt<3bZzO zAB(pa?Jb7(s!=X;NLiS0j$|dB?qpI31b{Bvhw1`j5>Y_%y6w&^Q|GwSAPsH&wS;WKqpNqrv z%7Y|_ zM22FU;(6E2tlNu?TFy6ej#&;okgu>8kbaH5F}iFug?^fIEGuW~VwuyCYQ)Q<88^}H z53nP0<6c6PMMs>4kekhf-M%9~Q1*zjCb>cAaRhc3GT&uX*_6oCC<~jL=7j>!JH}m} z!#a%R+QzDc`uEai+UvoaU8#5dR{>)Vem4FK2D$Sh`P!xW3sc)?w$J?O`rCE2|BEn* zp4=YVjSTEX1~6+*EBmKuee|_@atfd56kv3p{V=~X{J!~tSs%T+8-Hal{tBXg$3xBj z#{S(mepUN*tv;G+MzB{)2t>{hsrEmK}g8;yb0UVx2$k0e5GNynJ!L%RiX#@d& h-ji;Ga5GGLPj64Xcf~`Al5nii@KT1s3=d|9F>OVnGgL?;q0S+wQNXlF>RwA^K8 zmU?Vv9l4mGz#MjO+@nnKeWtf?? zz|8QO__ai8mJ#^*w3x`Gg*yz#X4t6fx7fpEgw|}(^4h{xNcr^CiQ930K9fkNeoI3Y z$oGra(hEs0Hj_wZcmX3Ga$P4OK6Z^w&GK=b8a-?%|M;{Ynm?d;3Z{X=DFF6OyoED! zmKh6YeaA8l%))Ow{BvMFZ<%p$PRO@y$cGwGW@@XB0`*)_&rz+{0PWqR5BrRZ^FW!i zTIQ|m-^KZ$%vCMZ$8F%<*FBZaIZM_~QJZr%jNb_Tc)6xB-4@OdzkStyYxFb%J6^at?k!)lI1#cF99epUP7Xdu&+Ex4HSb0(F@B-msk%QKm4JX0pPEPl5@tQa=M zunSO@$|T|}$`nJ0jJm3QObD~$SoIfnOlpi(dYk73F`Z(Q3>%O0qNtZLAljAbnV+xp zSs`mOViu5L%^sU2vaQ)-q|=B=vxt0hM!>}^B3}&i?9iTT={bH6yD&QkWbP4#IHqS4 z{D8n-t7OL#DUQF*3w!hqV>*w~`8%2;24#s%ES3%G6zQKYTFDOJP+hn!c6d1o_iXE>OKx#!>l2QGJxq zowH>sYaR+XlVpZb&9$*Pi(~@|97YNx+uJ72nzOxSyG7rkZksPtx9EtCod!m#(r}&^ z)5)7W!-D#nVRj}HH+ZJ!^ofb%J-eBnIX01k7$kvZ;_11$M25L=dLjbCV95AFQ6m#R z#+@&QqC8 zLUm8p)*#%Jjg1Gue<~HPw}LLRIlS2<}p)PP?~7DVd=w zh(%=n?Xg4DKe{G%nm@9KCwk2v^;#iqSkq3Zg=_0m%C=E0!@9-QYr@qFs|nYX<$b$x z4bi9rsw@9F>$%#?mO21O-}G=+$!r7~IoPW?JB(JH=#{tE_%ZF8d z-T>{EbL5-}D(B+dxd!;>hO{TsT4_a5*WHygj9f(k#!++LOh=^_v_KgrkL2Y%kG#fN z8liGN$yY#O-t>F`IUiPYk5h~EmA}h1N{xlO8K@Rh#@Zr%FC;Z^zAVs~YmgeR`ztjI zxZE~9+=iLUHAzhcT$7s~&TZrtV5g~ovb*V#nv6Abe#sAXalmiOrpk~ZHEyWiEH&Sy z?@3Z~q3#RRS{gad1tf>?qU0!qYBg#p$#3ji=#kvz*>OvrLRAWDEkL@OfLD--8hC~l zP(ku=O$dta{VFq5t1tPbz$3r0el92lx#k)Q6%N#m8hp;E0WP?qmzL*ID78F~La7BQ zT%_-9lbphS$@vv$rMHg0=grD<$K)zwz6wYqP-w5sO)bT>8oxuXRce)7lD|x?09awP zRvWOwe$ZH7{-r|Cn+Gn0J=cR*u1)fTW^CW6+i2=C&ez{okcE@t8u8EgEods$&`a&3 
zM9VFcjzF*-4S$9)wq*{~xR6{%o?-Ox=Ftu9J0v&M>XaHF?2U{+iJtr^x= zkA98=7D=XA4z}mY7Rs;yfLY-x3z&gTaSVSugP?&ylT55b3!6dp9}DSlWg_ba-xBH^ zPs9=avtWPDvGen2k|NQ}Je#?OpdR5%0-9$wy7|Wwyl7AGbYW8xG=9vJ4BP}Gd#NNo-kMgVlsA?+l8m#_^S+if`Sq_W}%_e39o}Gi1i8(&K zkjaK?$f&|rWxz2;K(w5-2xGHZFeB_peERBj%`%sU&a6;2sd>saL}E6T7I-u^kUPy@ zW=eBFBLH|jhc+4uR*hy~NZm-m3~ElWEA(d1tP7c$f&H4BgocBLlxoaon&1nbU}dz*=Ai@m>D5GS79&f zgl-9GOxD7rXP9hLS*8FRBcb^Fkt(A;?fbJ*YJzC}ah2G2G;1$wcag;Kgi* zz<+ar7c-(}h6q*6c^`WO(t+E5ZO)#4n1| ztXb0YVAmo5o#W@Ol24Zq^%8{QnpK=nCNi3NK0U8luBQ_zt)aXy^hz+4jtSK7L=Aa( z`V$f^+SXe;9^~H5{qWoOzYSzIbrb{P^~P-vzo|4biw!V#XWx6TuXRLMJEBXX(lMlV z3@y5fjqMM-?|QLPx7ye(H+C1pT|YVfqtk_HCA?P+?_G4Rw=$LPpc5!P{G-E5qsq2H zb=%;gqu9JnZtjJlipL^@U+kR+u`=OPDvi-EW{WQre z26_st64^z@T1dHQu_^bK|N4LSA=W%L`1lZC@dUNUTIT0j;S5T7Ecxj4z3LxUL836*!F~0 z23}SNUVid*WndCU7*zuO{}KoTUx5yt+_tsuk=5>z<%H6GMD0GZk=(%}YlFvE2ai8~ zT|R$78JtoFr{qhoDTA*;X9tzQ9@3e;iKmNQyVtsgR=b9lPbppFYS;MU>Eez(izgmN zlt3T6pFeGlAn`n1AFkIp89BB#a%y$t)Dy4#+GS;AS{<2|ue_;@ya^+{qyz?^2HNEI z_|gFg^j8URYJii;3+a**J4cf;Mx&IJ%qT#*80q0BM5I0=VSWDd(%9E?D*rA?lLb5W z{?NmPzZiROO!k&v#c@z|9F!dgpE^9U_l<{92;g11p~p`q^;mISQ5{!g$CaWlkavqO zf-3k$&x!t*51N1FX+IOO{Az~@(~*~6-ev#w4l|^G-A`kB7fBzqynNXHPbL~d&2p6$ zLFeNx9mjpO@}Sp=*vjLu6B}KaU@N12Y?3S(zHi-xuD+U;Y~IpS2L7)?)vBxzgRx|m zEJi$;v&~Xq;o53e@^R|QhdD>Re@t@JS;#BwP$YQj3b56-w zYsXn{Qo<$l5`KZrTu+N-gQe~QB=3w>J$tErfadG0F+lTuxdsEywbk4*&LAw#5o2IeZ>lhI1ZldA_9uCXi0eB0?^%z3M5mg;g z*%AFJSOk5EJ63}`mMlx05*$*4LwVPFPhZ};9_Um9eI?50gsl^T{PCy3t|FkDE5+7r zYpv1M*632^(YKV=gKF!+V)u5n`^A#o(&GJug5Wa@@+V7PsPG&}AAHF4%rRf4} zM!joYBdc8_%lyjJY*V`U|j0(9lQ^$u{%# zmB7zClGJBQqJ~g}#*lBdZenveh4N7}dMrHz;zl+*6N+x zaKr*C$->UnoBpS&+NW#}Gi-)o8F8MEgXV~%!@kbo{^MEuXke60PE0a*rk8*-JUF+g zIVEbgI<8hA&2SWginO9ZfQve^xAYT`BN5OCb!|W)GhnKNzO8TkmCi#5(Fu{k0}+4ZD+lVp;#CMiHSJs6sjBbV1)tAl@Jpt(446_K$K(x$RhgCr`wWTrJ`^OyV-Oi zf;$8`79pCDSh<@{L4;YTl%kUeN;>Pml{z1@R~oe>W?h-Et?hp zu<9Sq+yA{GxH$WMMrr6%8$fN*&JnPS9%kh5kpAlRWcTp8uSNB}u;v?B^$je|D!x(G zH!3?u1@uB2jx*4%CM)=D5N5auLFeG6L(WP-2o!C8&~c9kh8v)VEmOC4uUFGs!!S4; 
zu@%7T-28k1Sv8ltaV_VpsYU=ab;k(+Xv_#sQkI(25Wv&sZXt)zZP`S({r9BX{y(7G z^5t|(7Rfq;_d=`-kLtC`S#%~wtDYu;bD{>H6}sbyJBAT?KoM+)Pu{6tbowDCb2SaR zK=&>X-yI&sX5b$)FMz8?fG7bDxXc|moCW7t?R0;`VE_7^cbOaforHLe!DNlHFS;G@ z$8uy6PivKZbdGIiyv7M?x~YYE&{NRA@%xj!JIHE;Wp*c{=;m)8rDx+yGq#qq} z;0AIL?-)rFWd+)8i-X!aw^dV>{)#%&f(8yN*&-S#I1~5~bS9vqJlk`M+(oKfgxJd6 zr$9aNKOg`F)j|F z*SK&P@bvVJt@e$rj6Cj>ProKlUs3wrRQukP-Q7>aeQJ1MExdmVKLxQ_yz=;UEt@^H^G*I1X?Rx#8N92-okp{h9E)<=!pOuiby;$bzC~7 zXbFk9S5-xqhacH4qv&ojm;MVXiHH$EZd<69j$){%*vi0La#~ve`a+O*mE2Tg`w!mw z{#y@c3fC3iKGnBRcI?x)OZ<|Rr}|a@9&)~G|G$L1C_l5N@7wF4Z{9uwR}tTF)Z5R; zH=INp?P=U%*s=O8=GZkpegk!z?%vlA5g6UHE1_VI32|*ktv-+TT;+gcP{Sf>){U;Pn%%Urv9NZ1LUaj^$(@p9k^?;_SdtQ!$ zZ$iVhPJx4RIPEe;_X;;4skzGsF1RwqvgF?-O8c2}e3nn~x95eIvcm@XuWu47f5hl` zG93qQUW!&*g@I@hY5e>*)Nf#A$o@%qm!}qo?;k3hm7DkK8TBDq3=wM$XA5y8(#)pr z5U0TAP0Hv+5RiT09SG_-p(DDBAg~qP+3qi8&53-ZkPp0C^i^Q>mcD>cpZvn8-1;IM z1Ni*)Os`C-P6Dg+V7<9@ad>eUF;xgKm57YIJ#v8hnQOvs{>Vl{7-`1q zvA99UVwx)!1IOnA99zaTcP#eJ1vXhOvBzRuIu6-p*zyIu4<#@VXci7G4G5R<_Z$WU z?piYlyMY8h#irQ4ke45^W3uE!MU|bBTgz=@^NYp+5Xt(Ib+s3--QFTZ08C!+04QV!Se> z$B&1fn4esD;`rrD`tL2mtf|N9Dh5L(GsaA>9`B0k@#p~(uRA>@E0mWgV{n=dSzjcL zF~-IiV`Gd*_LF$s=`LB}cZo6v!+qABq%FqS7UTW_60bYG`P-_GDGV#V9ZMlKGPW|M zL?&e4gz7w6vO>)gWem>Jjn=^>;2$EK28cEWea+TaXfh&6;E*JN^pXT}iwVLmCQ8(% zus>)Gkv=fSE-=O(FvbobChZ|EQO^lnH1i|&e5HsZ^({ueWko4Jb*k{ z$dvdq2*58((HOa6+rM4L<*-Xb+98{c(mmu!5C aRHtlwOSY>P8fIg$>YyS=KM#Sp? 
literal 0 HcmV?d00001 diff --git a/augment_indexer/__pycache__/index_manager.cpython-311.pyc b/augment_indexer/__pycache__/index_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dee5c6b3a2237157a7d5c62a4328c3e578c6cee2 GIT binary patch literal 15403 zcmcgzU2GiJb)Nm@?sAq~{x2y}9{rJAkzAURY$voNn<7O~ibOdg70ZmB?TRxbS6c2e zGqaYsCQ~=a1JS6c90;|F*yy5lT^UkPJ>)?sn!rj5#0C1W%TB=U04f9odGJfSh8)1~ zL(%Wt*`1wPQm*sU42Ng#+@EuQ=X~d!d-orL!2pNrZ*IJp__u=`_bYm-T<&t<@pS~= z=0t9m6CI*6>6mq}r*qbcrz`1Na?iRQl;%!)mb|lG7WO3TmVC3mCI74+?_RMk8Ca^H zt!H6hl3xnW2A3LU82yQ ztBG_fo{Tv4;CMoo=G3utN|mmux_cs#lsGiB;6q`>hlwZzEN+zl5Mw_P9xGL!l=cVPel2Fs~%Bj?R`ePA{6Q6?p9JQ%(`O1OE z4M=&LlQ;}5N5eYn5}lG;bV(l3EqO(c=$-e8b>DN()`_j64^N-yk^EAiv%-m+cZz<* z*NXuu!1B8>fbLly`GQE}5pEEJ_A*FoltPH}TKQPQZm|L7n#4wX9>&uxhLF|_o~RFX zxWy*Cw}{PBtJGHI!Rlz0+N&{0e{y{-;L#y^G0AN@&qlgD#n~fYV#bB}v@95NC&-lv zNJvUlubV5+vljuHxqw3R6kEo^(7m#xX5`e=TxH4AsJhDkk1rwcHn-|nmG*%fS z4=YM-*;?FQx;nBW0)2Rbnr3}q@xnYlCB_PMD+xuF(5huyC#&dW{6A~zFh0a0iUXmCD1I<%Nxl7`}$g(ZC1 zLy9~{;e~`WD90Bo(J|IRNgiU)*peY*qRT628PXDJES7CMMPpcAo6$;iJ5}~H$eUcL zo$D0V-PeP~j&3~o;*S06Zp{~d*s(V^q3wIN(D9ts@f@YB&8Nz%g8c@7VB_nDWT#koTurJr-b;Nnu#YX@zax|mSsG%m{0_WS1~Cm!dzSdy}Bqt zpr1T7GkM{747r7+#KNK~T#|%6lP5;U_XzmX?bZ~IPfV4vd)*hT+cyTUz%@Ep55{;@Ubm2FCu8dAKwFc zoAYz3WocpZ%{K!4yDs|zd4^kasOI9~ILo4Q*k#?EQ*_gx9n2lvG`E)<#@Bh(dCm0- zch%wGUV#jR8sMfBwStal+dfW)it!3dxyl*2BcAxT(0Fqdo=L|=LpBLzNh7D13~{5Z zK+Y^#7Tk%8Gbg6TPfeW^X3h!cW+qRZ7p7;7dUi3w#yxS&Y*E)X1CUJ>S)jzRoR zo|v(S$jmUnDAG4n(Q1H>1uEeq~F&q@rvYgKCe@ zGaaTz8<+uH(}-a`GC?y!gGemVp>vCy{Z8HSy8KatUKh-J*+(+Kz#;Q zr$y-3Q_zXsP##ip7$2wZrkT;*RFU39s`6A?ok(X=;t5$!%N4=sn72v8`WWRma6KEg zi9t&Wj1o@y5eVepVYfg0u&M2asI~XpcHLXZHytZ99n+eQl{iOm--o*&cI>#hl52Wv z_2z2Hg%}oHC7)2PPl;6HwhMjs*=O5DSK~hWb)$ecf4EADEgd%w-#CnF{b7_>qG<7V zJ4Wlcziu6I{fI~4pWMgle(FNtfx~gE?tzm9>jI-~t_Q6SP~F28|D1{8;9!d95?jloTB$`ojpGU zx{v;9>lA%Y1OTK&{P8X*EanF~9fyXIAg!w`bRh)U%Kd*4?h4grNJiB2`-=AI-%7ME 
zXDUL41pOBa_EQi=ki+<*GfeBTAc!H+F=TeRUzHetED5&R;D)rW4s4&ftO(5LvWxF6$z^dquqEL*B=}K=Ox6x<%?DEorj9zZvMe3#jV~`J z6LZY+8ltkjHnx`0fFK2WlN2}BA{3Cz8ATQ10_g`rSpn#?$aWPKM6e3tP{E;fUy@AL zSSW@;DMB4L^b^2BJhj49?Wiz;MFccg<;10oN(u~9nv?-DDV2l>FH1&&>^J!_d^gl^ z!ZlJfm`(w;7Fb_yaC}ZqE6|$YA4!lpaWySpf!$|O#kQKTtgA+^tsP4X^ifu!X$DT5 zx@FXrQWZcG;Z`htMR#14`yhjoBW(swwBG+)qK?`6A zu)_AuSqza%UXFVo!8_(FELtvSVwD+Xh41#^TVamRfb$+c6C; zN~21ihw=r32fe;y_WZk4zXfM2`L@>2%%9e3I?SI_Yi_G?CVE!gKjhwZ+G}udDw(O} zi@dH3dHuGrq?uxuO7?2`TCHj8-N5_k^&PY4b50mv^=t0$z%(Mz9WQ2wV3fU{NM{t|a}|`ubS5bZsWfvZFijZJl391GrP(Id z7UDIJS|o8qn9$wxnPgHXd7^WXI=!ykI^9hISLaC+QCKlaEWeFQNeOaP=jRrsxhstG zDN*+u#{cwUT=&3GOv_V|2HmqPV}zKCLEeojTcOalZ?^PrwDjMeDzrSWwLG8apa1M( zBMg85x?+-td{e%8FeicFF7Q!}kLLO4!pJ|fz4O-o zw=UnjTx#^SHhjW?d`hIm`GQX-kk_A;JgAv0jeQu{`=id|ySWF??+5+a?!ee#*U$Qo zjSabe9&sT2^C1tY{B>m9-n!_4+$9IMf{h1~VxqI^tQTF?Fr)~07*|y`Utv;P*;!N0 z{e*J#1+mL0Cp$$CtJAV2V5`T(ge$D1++?*>#JXDfM>Yk@n`rO^dB1H>Z1tM$M8E#e zw!II0x?^J0niIHz?U{SWU%h#HF)g@ZC zl2+YUSYPBOv0miuX+Y*6A&=Pbu4f*5aW?*e3P*cV=IpPsq^!WY%&oH(hDN%Z{OgpdRZPsoy$L`80#)`m^}wI{eZP>%T!vJV|%5F z0Gx)0QW7K1i78?bzy{qMcHJ_{B5lc05oIRZtften&9Z{+9u_6^WnIEC;d4hs=hZaO z%h-EWvLQ1k16-kxwJNcY!tQU3Ma7J&#YnIQP8rh4r?6rS=YBAu#4N38MAc*vRGOd% zmg3ipPAPiJ_=$7CAG%>+Y?&j zp)wwCG*0FlCm*#8=Uaxct-yEP8q(?y+!pU9wdmwV{bas=l0jc@f$sy;Jl|Jr5pEns z*3RyC_|5Rajqt(BhDG_=T?CgGG`_$5-t&py(+OM$Sr1~TgYTsFf=_S0r5B9ayKcU^ z**3h)UQ5w0a@;{6>8=UmtxGYAuG& z=4S8twf!Uc&{^E~-!Pt_C69~1*aa}=xwg)mXSCKxzV+BI2am1$iBn3e-+$Y`Q9qoo zA7-4UawE4>+TNr2&=l?uBgPZd#tG=ZbF*_~qjTg-bf$|s(-l~6pi$|fQ8`#-Gx4Z# z=lUAk(~+k!*EVb?D}#{p@!tYHwpe41WnesHtn1wx8^%G>)Rdt)6M4YNypY4h;DusQTSY?No~4hW>iD?XPK(lo}UqY@3_{yutFQQG#L@A zTP$n!scJsaF9yD7e8u`z_vKI}KUNsOeP31`cY}7@61dus{m8`Cmn0c-89R3dq*h?Y zL{|&&ZN*dr+YBr$*RY_+# znHc-LZd9b*B_Z3)Jp0(ePaAqrG#bS&(r#+?MUW3%FshC7Jh1wqh0RjR%OErIixecm zC69stf{#;pjL0~V6GSG6oCMK>WwZ9Yp%XP(6oin731WLx@GM-4!}=%RO4U9$OlMC5 zJIjjhN@6pNbY7j0i(*Vo8;3}AuXK$zO7%L#h>1B>4ib+eL`ZVr41wt!h$X~|#z691 
z_muSkL#;762FpD+oEdtv?UmJ``%}_YBYCU#a0aZiT~D6e|37)%wD_u3J-uaK9Gr&o{ha+y&o@n(xKD@5Q2zzaHH5 z?b-0{$sH^B4r;!GdEY@p-w1Ao2R6b3g>Y00N7sYJ?gN|M2RFJ87P=2>-Eg#aAzSxQ zv3vJBuWH@TJ?!c$Lbrg5&{qV&zpAzM0IGMu76*ZwVMjx$u5LG$GswEXgb_i#;SFE- zR%gMtSM%-7`}Y32v8&h`E;e;M?A&?lTZPU6t#hE**;VZ9(K-+3J4fz2%T+vG>=Iz< zhXSzlL+CFEEPa2#IGABYTe8#`eqVznR*T=YHm_>pF%DsvTtugcy*tr!*IR3yez^l` zYc3oIv7noEmrty-SaVL%ch_&nbYSF{!gKg5E&kHWCaS7!=zsaD#b2O`*{aJXs@h`d z8)Fx|m>8StnI;_5Gm&rAm20fzvp&W8yL{~kr*`ax9`NP!S3IL=m5GQsOHZry+1hQ| z<7j7Y$~x$6f-u zRmhfJqe&pW>D`(#SR)>}3q3VwBxmtmKe;ZH!|^OzIXZ-}<=U#v*veEpZz)-$kJYW# z<>(;Tma8V8Z`Eg;F>^LBW6jly#1?2@@WtmWe)lTtRlj{q@ojvynhH-$t8Ij9M$?v7 zGy1D_m%A&RpqF7s!}0~0ZN1z6J~V_o_%|--5gplIY|+XrF6Hu;vqhEKs*hFt83r6Z z*~ejF^1JF|+qJB0I15?9qLgiuF&dShLASG>C~e1PcNoV>4R4Ml<7xWQ2Pf!|=@QQA z=w2l=KcBd!d(lbu6q(0gR>4UoT}XZogmx+8%ZV5c0~rAvfttY?^T1LZMfX!z8Pjr*lwx_FSQ0%}a$q;Ot@BprJG%<4eOhZDbP#8~KWgj(r*hxY+7I7LeB{XU zrwjaPjX#Y*k#GCUbANuWz;|nWcb@MChwcN~&XI?qmK&$uYPi`@4DWo0*TRG4XMd^A z-3^5YPacsm-szgHIc#1zT+Ex z$3GnTXh9qMMxpOzt?%W6Z~r=fBmHY%FyC+yI^mf@IHH9kc)|UXu@4&yM_$s7ykxwC z7JRR0zE|?TSI`okozQxQG}E)2e9s2olUptDhc*6iobt#D>v|UBNg5*vW(%8LPu3WBk;eFHq4R*&d4PhkLg%2?Ik=7HSPIsW z+6eL~5$L1-fI`~OPe!9-PVVQ<@K~1%E>*+tYV!q=1Ci;Hx3Lt-qFi(0-?i!(=U)4W zd=u3j0@3k0Bo|_k)41d)G&xhJti;mOrbW6fo*+x<+H+>L2zPzdR=cab$Fl>v*0yyb zH}qlS+QMtW#>-vi^$o6hTpU$*wT0WZ z_*~s>FeNGE>VSb-BxMb`YTI~tjRSe#05ezNk0I4@DiWq&#n70?VtODJ!(3*P6y{^GZ{r6a z<&?TuOia(kVoZsU2~G{YffQ4DKN0${Ws+#@yn_YRXAq}*RNBtc-IvnoB-;Uz$&n!w z2IJr{7O_mmqp>7tIzQkH&p7??LM9uG5i+He?_(CY4&{N50*TL$Nbue5bzwt=D&@EcX`6MF6J(4-NPjg;ac*dW54G>4#dz1jHPR!kAEM9VHPV{MTEqX!KIK4u)_H-0hZC8a&77z>51K=LCPOOAQv=IB)- z!WPU26r!KZFd^}|?})LqkIDatI06LaS0Ff<;&7ClP6sBFh>vsdZ`RXak!yUD{S~=z z-hLOk9eMlxh&!0y?taAe=IwWpo5_E<`%!ymo@*(#cI3IHlJ}~|fl1h&e!}8EeL^Z5 Gg#QH}YfxeU literal 0 HcmV?d00001 diff --git a/augment_indexer/__pycache__/main.cpython-311.pyc b/augment_indexer/__pycache__/main.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c31b228160789f30e9711103489cc0978b188151 GIT binary patch literal 7988 zcmcIJ?@t>^mfd!{-TsMf0)Y&WPJqdnA2xvmG6dLVGLR4m$pDEi?Ce;My8#npbKULy z@R*HfG#}XA=$46CVa;YW{&32&N>0j$J8A!fyWD-~j#?V4C8S6vKFl|dW=4vl`*5$S z{i{uOGCAqW<*HXzuU=KXSM}=E>%X{Mb^^lJTfdC<9wLbUMuk$Ki@^8K0N5coVv*oT z&J-gTNt~M&O*l6%njtsGs5Q%?g(NtNv&85%>!KCIbc|WEE!qIK@^;?AJC;n%r4fiF z6UTf(EjmkWxScYz+o0VA?KZ%*1MKF229zC36z7C;9aqP>AouWI&VAE_NDEFA!FwJ7 z^te0{jBlF9f4?X7nf+2Y8fW>qB-~}!6VbTDE+qu^d{nxSyupq~q-Y|}PQ^L?4lmg4 zSH)8fu+*@Jr!1j4VLefy;oj6kXP6ftQj(JW^g8 z0OsPqC@ZO|Z57ePA&0qBDwT4Md;}Pe%UByknh@(GYpI<8t{E{tC*O*`+1Pu7Op`VA zDI><`WSU5mOCT@5B^5l+OYHcSDK;YT97rr0j*0A&kXQrp-HHl{_!?BQx57d+d;`Q1 z(7fYU&tICDT?j#^(8A^EiP=>DfvVSbW}_l2@c)7sdyua0DC3I zhee(hVFgi%l@jdTL{eZSJ|2!sz2Z6_i7rJWED&ek;{(fq9`?kFB(00X!5~8H)e!^X z^=KfRTt@4RBse}0Nvs7=`2Cb-N{E`Z$OX;3%uAa6S~!;ECj=oO_)VG(dWNLLO+Kz! z5ttNWBH98h`|4D1C9%c_i)4eM5JB*Al~)#_o45TaBi3&95z zL3I#f0+I}|@9}Nhcia!$^3i_9GoX3~GE;fSp#ni#E|YuiCb{{N;+|FAvobTg@AlkZ z{X_hBak(|1xPz)Ym^q(!_zDDNu>)$sM7pk$-v8#*prepO)D8shz^0aL6{K{{k>aSEn2P90 zlbofZ-UJ59bfpZ(!cn2ZSq~fw@?)6WhH1n6R_kI8tc!Z9{XWH^7;?E}N)tR;w`?GH~Ul-@Dkt6+9{p${_*P6E0$i`!w zH>fOA+MK4-)+GwmmN%8em%){=0Ow!e*Arq?O2B0kjl)H=7RDFMPbe|mc@C?2k$o?5 zJI)I|m`o3QLkPzsD?Mx^DF~nuLZA)SqLM#=yJ+6?QwtZaeh`|QxN>=ZYT@$Sb&YsM z2ERpHTEGc9AgR6sXzW^2l-L_Q%n;p$Fh%w_9yf?)e_XSddrWF|6*x3IerY1*5cnlA zxWb1yF_^OBf)ri`n1U>*(I3o>&z`%WQAm%JDa~IPPjwe50sKaF>bDB$a0pJ6xFFFi z;(9D9Y33Lo7u=}AE=plR5^uxB>t{5WNQv*7WgRtW7Q7^lLM@uTctOK_wYmx%iig*D zjn?_Bxs7}l85jpy0w@SxG-v~m7Wq(RFi@MoK|i#0z+W7In>9oHXeWG!wvX(zJ!q5J zfv0nd_l)X2lew68Hh$x5dG2h<&MVGis`FT8V&7`tJpEbfi`3RfpWn;e%ksJLhbx(T zyZqDfCy-+uI<2^8RQHU`%)Ia&{%krkv3cr8x;#S8xqCzLo>aXjGZ*(7o3>YX;t%3- z=iuK=O5<6z@oeVu3#<1VYs+(MOZL}_wOh4z7YM7RKksEX=}kIstN+H<_T1K%O)It@ z)z(uWEU3|+t&_ceh&oae=;LyqeQ&+Kx*W6nu0HlAz29(ndvWKJ2cP6-o*q{kPOA;4 zGqZU|Gr9~e=$FBy;y$ms&&$mD7Y@%>N0!dHWk*1!0zy47&!F0o%&}_!3OM?XY+c0lX8=(B&O7+xUE7rH_U1LXWFcRWfF{F`-B0xz| zX$w+RLY-7sY9okr&1DmZiz?03;4nrmBAh4mBjD5HGOs}SSyO?TM3`g2Dh6{kDZEkJ 
zY0jb=(5e3t4g66n1{tuyZmoHtKTA0^ts4zJ-GciBww`mO6Dht^Dp#i!o#59L~ z&T*yC4$BwN85K@IqPdDk{T$Y)ntd&NM+ZfX0eCVB=B1yhQ9@YpU>gXZcmdstni*ac zFub#Z zwRwB5>laGbxY{+YP$!T+*=o$T?Sszh+6g=eqq`Lu=S}@->%lT%k}Mlqw?)NvUl*0^;_4! zY};81_+83a8=3ByTnJ57 z_yw%?6Znhckifn<+`N5DZtHv6{`8h|cuYM!CR2y@9IpGre;E7SSe`i~H%`leF_hx? zo!NxM1xbN06~z^)Fpwb7^AU0R`~R`jN~-q3NfI_fDocbQjH+~lkbpudq=|~`jLHZk zT(uj}4p=SA79?qga+wa;mo=)AH0&jie$_b08a0kpgKYt9v@>ydBQ}k8)@vwMDdSaS zW#9mm%c>P%d#V13)wnO=GCEvp17(gBJy~-C%xQBD*-i(7k>53IW8SnjpjvgwREQg? zZcqm4*nX0q_B6GM>^U4>%d2WhI{>@PyEkGx0UIgdpGVsT*vQEHdDw2iMz7(Yhg}EQ z$j14**wB8c)Gi!0@Xu4-_PRAgdqcVJkfE=)x()S}dP|YQjU?iZcH$%^yqDMdQnXKp^Fu zE5F``SpjX!6!QUo`wp`yr*4?RlXwZ0pdZ}B=w2IU!S98gT{ro=*xE`p%<*C(c8f;_ zDS9fi@YsY-&GymJQQgT4BeH;$j7cI3l_}eNG7Yk#fN;1 zmv~%pZiSEB)Sbyw9|A^cj{0!kIbOfqVWcqJ=YU|vema;jsZq#=5YT<4xr)~B<*N%< zt}dh)QJ>3b$|Av3MpJHV7ZzP}T)E8Y6M=H1JdaQm+8z3;L*hz!R3pQ$Nc(K+m>v6* z(Ym1(yxFh>wm8sIG8VggEajREOW_#HBUd6!k%bqu09GxwH?i@6jgBf)_9RSRK)1L? zgLe?*Ap}rO5GCm1)qH|3hoY5>CZt)gy`x!+V`~&zqvpg*#8U)EtX?WIT60#GH^pf- zeR1@3DQQkU55=N!Uew4X z;UnP0YixR<1+fVwO9&wed<2_}vdQ+KLiMRspG@@?s56#B``(r>Ten*`>Ai;b+~hxB zQX2ZzhW<_aKI4+z?`N+){1`HbyOU3*bx2``Rc2UbhW8N4d)dZ^t&l-f7{AK+WyX&i z2C~=G)|0Y(0HVU2QkhdSa|*Zkvx~CZ4^d&dRHjR2y1>&=&#Jz6cW*tp_wV$7xn$op z#dl5hUE8#xrlZ;0yYy4L;ybJQ&Td-sOx-uk@#oBOh3Qb44w>n|lO4$|sqYTU?h%Lz zGpaJ9GBa8LA4c;|>x0%Wk8U5`q=A~vExWrErdwsYWu_ZXs4M&K!vNGCXz19wwf(8= z?trK;ohs8QGo84{=^U-Hu(Hz-6=qOn24!aO|J)Kptm>8BL5K>|r!svq)Az#RRUJp+ zp=BG}b9!Z8ae2NG02Sw`>Kv7wqkwdO1Po?d6UOPvPClHGeO(a$t^Qw+$WFi^bZ1U)cKSx6}8hH>ZIO&F?(4f9?Ltox7(r4XI5-o36YA{4aLfnHQe!?BK&O+0zX% z@9E6a4;`|n6Jp*om}^vzosvC+5P!PVmt9iZ0r?ac|E2=plTKTWh&_#p_qS zeps68%f#>gSC|2n8IViyEwb2FSeSgKs}EQFky{q$Fr<&~B1Y<$FNB09`jDO@gL^ zT$!QJUkUv=jclV=%m@%l#`rN|6^h6>5IZ3Sp^;>PqDiVipu|Iv_KXev@&wKhV9(&+ z3!+Ch#uvm1*%9>${b^v^qB%RYU3|A$d5fb)|G5HNiP|nTNb91w7Zq8Fo W=EKh>znIM6U%^aJN1z(B@qYm`Eln8! 
literal 0 HcmV?d00001 diff --git a/augment_indexer/__pycache__/models.cpython-311.pyc b/augment_indexer/__pycache__/models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0588a6ac9888ec59fe52985a9b560720dc2697e GIT binary patch literal 3886 zcmbVP&2JmW72hR?U&{|!A91U?vDdN`m4;2haqAjsVmP&uSa!`8wtOfr#Bz5iZoS-9 zXO@vI^gtCkxJ3>;7>G~pp@4%ThyE)|Acw_VN}#9QC|IYQ^4{!{qD)#Xx}1G_^WMzv z&YR!--qY8`qNc$0_ug&$-+4v(4|YaZDpp<}f%27NDK*7Xt%Ret6SahjdeTX@NsY8q zwUq1=C*97}GEz@D*> z<`di9(QSOR&V9qDSM;V0&ox_yyF>XE-Sph1z0+X^e!~^ra4nrt-f?_h(Z0j5lvR<# zH=B;Zc{w36zp;JF3`b;tyNh!S2S?z-tWDeWp|h}QGiv&GJlCgB{YUs#a~xn$MqBp! zXVCddp-N2!I3yke2x>`6Xv#`_2p*{=QL_kyFC|J3>8{6Z-(%0V+@`m`e%yx@!h+j! z5i-0JE3Sx?x93tOh-h7)@-^}6MmU`}$MiMe{+ za9t0U0xRbA`ahM|@LkIHH#S;cn{F7Loi=s-4bDu|cWio<8Lg4IEDaAka*_nFM4Jvs=CEOiR_gCj z>PA>ZP_m8-xb}4F8Vx|C?V5~Y6b)xQrXn9F3INhEltdAL2;geBZU0fr5E>h!qFTJu$*G0Y8jfP z)Yl$>Mufwb241D(Iw%6gfBJ_!@L=r4SN}}7%YACZQ)ix zu=UBUkbE$B?{YwvdaalIMLQs%U+$C3At?{euXOJ|Z}s?>U|xaNCo3U&e{gZFd%ySN zmzQ3=8xR2KTA!?iWHTU}5&rO_X5;T~;5ik4NXR3?KwT0BkO%6i2vbszu_yHyYf_ik z0!-#=X&{{J3A}1CCLy;^`SKEzIUuHj$V*~^i~_kT24&QV6W9MlOEAMXY#<%OaU{Mt z^?hKX2#*mWcPYa}GG-v%YXauuM19OOF(|XfuD$I&p{_`S_Lw;$zhnEYPD2_QRzrf| zERk+7!!=u?Xm%LjH0H0W(Avh+h|4%&+IWgc~T6GKN*ZNQb?93 zrYxT%!=DcH)$YAs_T}P>;sj+@W<&UTK(0qv#jpN9v6`OEl`_$du!>1z3dA~s_p1$qECSyIG=IY#JYIb$@ujvuk-p25e2 ziIv)3i$Y<@bQ}EO;vLhZoHsj8jL%GLDJKO1E5LIi%kFEgcQvj91;M&f6tFhr@~i=m zM!iky0Lw$qY*F*ct_OhIvP3RodPu-kk)n)wOcWWFg#C~j+;hu?Sr`_i;dxG!izTUU zz+OjL`2*A>z5WWdH{g5?Z73|bPlt194CnGdn-3O#@vU|vAlUlkMo3l%3-1I3Tc5lW zl8dKy&kf*kRJ+xKYDm@w=PqT zW>gVXqLMPIaAZ*_3mKV*Bm>B}os8x8*$<Wd`f*NW{^nQPc)}v!rsiZzuhstc(TRxPYqe=gY zlE@DeOX?>uZMHm=_9H0|lQ8^?<#<9x&eheQ50zC7*9GeNaErLY$xh= literal 0 HcmV?d00001 diff --git a/augment_indexer/file_filter.py b/augment_indexer/file_filter.py new file mode 100644 index 0000000..88ab035 --- /dev/null +++ b/augment_indexer/file_filter.py @@ -0,0 +1,123 @@ +""" +File filtering logic for GitHub repository indexing. 
"""
File filtering logic for GitHub repository indexing.
"""

import re
from pathlib import Path
from typing import Optional

# Filenames that typically hold private keys / certificates; these must never
# be shipped to the index regardless of other rules.
KEYISH_PATTERN = re.compile(
    r'^(\.git|.*\.pem|.*\.key|.*\.pfx|.*\.p12|.*\.jks|.*\.keystore|.*\.pkcs12|.*\.crt|.*\.cer|id_rsa|id_ed25519|id_ecdsa|id_dsa)$'
)

# Upload ceiling for a single file.
DEFAULT_MAX_FILE_SIZE = 1024 * 1024  # 1 MB


def always_ignore_path(path: str) -> bool:
    """Return True for paths that must always be skipped (contain ".." traversal)."""
    return ".." in path


def is_keyish_path(path: str) -> bool:
    """Return True when the basename of *path* looks like a secret/key file."""
    return KEYISH_PATTERN.match(Path(path).name) is not None


def is_valid_file_size(size_bytes: int, max_file_size: int = DEFAULT_MAX_FILE_SIZE) -> bool:
    """Return True when *size_bytes* does not exceed *max_file_size* (default 1 MB)."""
    return size_bytes <= max_file_size


def is_valid_utf8(content: bytes) -> bool:
    """Return True when *content* decodes cleanly as UTF-8 (i.e. is not binary)."""
    try:
        content.decode("utf-8")
    except UnicodeDecodeError:
        return False
    return True


def should_filter_file(
    path: str,
    content: bytes,
    max_file_size: Optional[int] = None,
) -> dict:
    """Decide whether a file must be excluded from indexing.

    Checks run in priority order: path traversal, size limit, keyish
    (secret-looking) filenames, then binary detection via UTF-8 decoding.
    .augmentignore and .gitignore rules are applied by the caller at their
    own priorities (before/after the keyish check respectively).

    Args:
        path: Repository-relative file path.
        content: Raw file bytes.
        max_file_size: Size ceiling in bytes; DEFAULT_MAX_FILE_SIZE when None.

    Returns:
        ``{"filtered": True, "reason": ...}`` when the file must be skipped,
        otherwise ``{"filtered": False}``.
    """
    limit = DEFAULT_MAX_FILE_SIZE if max_file_size is None else max_file_size

    if always_ignore_path(path):
        reason = "path_contains_dotdot"
    elif not is_valid_file_size(len(content), limit):
        reason = f"file_too_large ({len(content)} bytes)"
    elif is_keyish_path(path):
        reason = "keyish_pattern"
    elif not is_valid_utf8(content):
        reason = "binary_file"
    else:
        return {"filtered": False}

    return {"filtered": True, "reason": reason}
def __init__(self, token: str) -> None:
    """Create a client.

    Args:
        token: GitHub personal access token or GitHub App token. Kept on the
            instance to authenticate raw HTTP downloads (tarballs).
    """
    self._github = Github(token)
    self._token = token


def resolve_ref(self, owner: str, repo: str, ref: str) -> str:
    """Resolve *ref* (e.g. "HEAD", a branch name, or a SHA) to a full commit SHA.

    Raises:
        Exception: when GitHub cannot resolve the ref.
    """
    try:
        return self._github.get_repo(f"{owner}/{repo}").get_commit(ref).sha
    except GithubException as error:
        raise Exception(
            f'Failed to resolve ref "{ref}" for {owner}/{repo}: {error}'
        ) from error


def download_tarball(self, owner: str, repo: str, ref: str) -> dict[str, str]:
    """Download the repository tarball at *ref* and return ``{path: text}``.

    Files are filtered in priority order: .augmentignore, then the generic
    checks in should_filter_file() (traversal / size / keyish / binary),
    then .gitignore. Filtering statistics are printed at the end.
    """
    print(f"Downloading tarball for {owner}/{repo}@{ref}...")

    archive_url = self._github.get_repo(f"{owner}/{repo}").get_archive_link("tarball", ref)

    # Authenticated raw download so private repositories work; long timeout
    # to accommodate large archives.
    response = requests.get(
        archive_url,
        headers={"Authorization": f"Bearer {self._token}"},
        stream=True,
        timeout=600,
    )
    if not response.ok:
        raise Exception(f"Failed to download tarball: {response.reason}")

    augmentignore, gitignore = self._load_ignore_patterns(owner, repo, ref)

    files: dict[str, str] = {}
    total_files = 0
    filtered_files = 0
    filter_reasons: dict[str, int] = {}

    def record_filtered(reason: str) -> None:
        # Tally one skipped file under *reason*.
        nonlocal filtered_files
        filtered_files += 1
        filter_reasons[reason] = filter_reasons.get(reason, 0) + 1

    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as tar:
        for member in tar.getmembers():
            # Only regular files: directories and symlinks are skipped.
            if not member.isfile():
                continue

            total_files += 1

            # Strip the tarball's synthetic root directory ("owner-repo-sha/").
            _, _, file_path = member.name.partition("/")
            if not file_path:
                continue

            handle = tar.extractfile(member)
            if handle is None:
                continue
            raw = handle.read()

            # 1. .augmentignore takes precedence over everything else.
            if augmentignore and augmentignore.match_file(file_path):
                record_filtered("augmentignore")
                continue

            # 2. Traversal / size / keyish / binary checks.
            verdict = should_filter_file(path=file_path, content=raw)
            if verdict["filtered"]:
                record_filtered(verdict.get("reason", "unknown"))
                continue

            # 3. .gitignore is applied last.
            if gitignore and gitignore.match_file(file_path):
                record_filtered("gitignore")
                continue

            try:
                files[file_path] = raw.decode("utf-8")
            except UnicodeDecodeError:
                # Defensive: should_filter_file() already validated UTF-8.
                record_filtered("decode_error")
                print(f"Warning: File {file_path} passed UTF-8 validation but failed to decode")

    print(f"Extracted {len(files)} files from tarball")
    print(f"Filtered {filtered_files} of {total_files} files. Reasons: {filter_reasons}")
    return files
def compare_commits(self, owner: str, repo: str, base: str, head: str) -> dict:
    """Compare two commits and collect the file-level changes.

    Returns:
        Dict with keys: ``files`` (list of FileChange, with contents populated
        for added/modified files when the download succeeds), ``commits``
        (number of commits between base and head), and ``totalChanges``
        (number of changed files).
    """
    print(f"Comparing {base}...{head}...")

    comparison = self._github.get_repo(f"{owner}/{repo}").compare(base, head)

    changes: "list[FileChange]" = []
    for entry in comparison.files:
        change = FileChange(
            path=entry.filename,
            status=self._map_github_status(entry.status),
            previousFilename=entry.previous_filename,
        )

        # Contents are needed for anything that will be (re)indexed.
        # Best effort: a failed download leaves contents unset and warns.
        if change.status in ("added", "modified"):
            try:
                change.contents = self.get_file_contents(owner, repo, entry.filename, head)
            except Exception as error:
                print(f"Warning: Failed to download {entry.filename}: {error}")

        changes.append(change)

    return {
        "files": changes,
        "commits": comparison.total_commits,
        "totalChanges": len(comparison.files),
    }


def get_file_contents(self, owner: str, repo: str, path: str, ref: str) -> str:
    """Return the UTF-8 text of *path* at *ref*.

    Raises:
        Exception: when *path* resolves to a directory rather than a file.
    """
    blob = self._github.get_repo(f"{owner}/{repo}").get_contents(path, ref)
    if isinstance(blob, list):
        raise Exception(f"{path} is not a file")
    return blob.decoded_content.decode("utf-8")


def _load_ignore_patterns(
    self, owner: str, repo: str, ref: str
) -> "tuple[pathspec.PathSpec | None, pathspec.PathSpec | None]":
    """Load .augmentignore and .gitignore as PathSpec objects.

    Returned separately so the caller can apply them at different priorities
    (.augmentignore before the keyish check, .gitignore after). Either entry
    is None when the corresponding file does not exist at *ref*.
    """
    def load(name: str):
        try:
            text = self.get_file_contents(owner, repo, name, ref)
        except Exception:
            return None  # file is absent at this ref
        return pathspec.PathSpec.from_lines("gitwildmatch", text.splitlines())

    return load(".augmentignore"), load(".gitignore")


def _map_github_status(self, status: str) -> str:
    """Normalize a GitHub file status; unrecognized values collapse to "modified"."""
    recognized = ("added", "modified", "removed", "renamed")
    return status if status in recognized else "modified"


def ignore_files_changed(self, owner: str, repo: str, base: str, head: str) -> bool:
    """Return True when .gitignore or .augmentignore changed between the commits."""
    comparison = self._github.get_repo(f"{owner}/{repo}").compare(base, head)
    watched = (".gitignore", ".augmentignore")
    return any(entry.filename in watched for entry in comparison.files)
+ """ + try: + repository = self._github.get_repo(f"{owner}/{repo}") + repository.compare(base, head) + return False + except GithubException: + # If comparison fails, it's likely a force push + return True diff --git a/augment_indexer/index_manager.py b/augment_indexer/index_manager.py new file mode 100644 index 0000000..c2bf48f --- /dev/null +++ b/augment_indexer/index_manager.py @@ -0,0 +1,395 @@ +""" +Index Manager - Core indexing logic +""" + +import json +import tempfile +from pathlib import Path +from typing import Optional + +from auggie_sdk.context import DirectContext, File + +from .github_client import GitHubClient +from .models import FileChange, IndexConfig, IndexResult, IndexState, RepositoryInfo + +DEFAULT_MAX_COMMITS = 100 +DEFAULT_MAX_FILES = 500 + + +class IndexManager: + """Index Manager - Core indexing logic for GitHub repositories.""" + + def __init__( + self, context: DirectContext, config: IndexConfig, state_path: str + ) -> None: + """ + Initialize the IndexManager. + + Args: + context: DirectContext instance for indexing operations. + config: Configuration for the indexing operation. + state_path: Path to the state file for persistence. + """ + self._context = context + self._config = config + self._state_path = state_path + self._github = GitHubClient(config.githubToken) + + def resolve_commit_sha(self) -> None: + """ + Resolve the current commit ref to an actual commit SHA. + + This handles cases where GITHUB_SHA might be "HEAD" or a branch name. + Updates the config.currentCommit with the resolved SHA. + """ + resolved_sha = self._github.resolve_ref( + self._config.owner, self._config.repo, self._config.currentCommit + ) + self._config.currentCommit = resolved_sha + + def _load_state(self) -> Optional[IndexState]: + """ + Load index state from file system. 
def _load_state(self) -> "Optional[IndexState]":
    """Read the persisted index state, or None when no state file exists yet.

    Storage backends: to use Redis / S3 / a database instead of the local
    file system, replace this method with the equivalent read — e.g.
    ``json.loads(redis.get(key))`` or boto3 ``get_object`` — returning the
    parsed JSON (or None when absent).
    """
    try:
        with open(self._state_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        return None


def _save_state(self, state: "IndexState") -> None:
    """Persist the index state as JSON, creating parent directories as needed.

    Storage backends: the state is a plain JSON-serializable object, so this
    can be swapped for a Redis SET, boto3 ``put_object``, or a database
    upsert — useful when multiple workers must share state.
    """
    Path(self._state_path).parent.mkdir(parents=True, exist_ok=True)
    with open(self._state_path, "w", encoding="utf-8") as handle:
        json.dump(state, handle, indent=2)


def index(self) -> "IndexResult":
    """Run the indexing pipeline.

    Loads prior state, decides between a full re-index and an incremental
    update, and returns an IndexResult; any exception is reported via a
    failed result rather than propagating.
    """
    print(
        f"Starting index for {self._config.owner}/{self._config.repo}"
        f"@{self._config.branch}"
    )

    try:
        previous_state = self._load_state()

        # When previous state exists, the incremental path re-imports it
        # into a fresh context.
        should_reindex, reason = self._should_full_reindex(previous_state)
        if should_reindex:
            return self._full_reindex(reason)

        # _should_full_reindex() reports "first_run" for missing state, so
        # previous_state is necessarily present on this branch.
        if not previous_state:
            raise RuntimeError("previous_state should not be None at this point")
        return self._incremental_update(previous_state)
    except Exception as error:  # boundary: surface the failure in the result
        print(f"Indexing failed: {error}")
        return IndexResult(
            success=False,
            type="full",
            filesIndexed=0,
            filesDeleted=0,
            checkpointId="",
            commitSha=self._config.currentCommit,
            error=str(error),
        )
def _should_full_reindex(
    self, previous_state: "Optional[IndexState]"
) -> "tuple[bool, Optional[str]]":
    """Decide between a full re-index and an incremental update.

    Returns:
        ``(True, reason)`` when a full re-index is required;
        ``(False, None)`` when an incremental update (or nothing) suffices.
    """
    # First run: nothing to build on.
    if not previous_state:
        return (True, "first_run")

    # State belongs to a different repository.
    repo_info = previous_state["repository"]
    if repo_info["owner"] != self._config.owner or repo_info["name"] != self._config.repo:
        return (True, "different_repository")

    # Already at this commit: nothing to do.
    if previous_state["lastCommitSha"] == self._config.currentCommit:
        print("No changes detected")
        return (False, None)

    # Rewritten history invalidates the stored base commit.
    if self._github.is_force_push(
        self._config.owner,
        self._config.repo,
        previous_state["lastCommitSha"],
        self._config.currentCommit,
    ):
        return (True, "force_push")

    comparison = self._github.compare_commits(
        self._config.owner,
        self._config.repo,
        previous_state["lastCommitSha"],
        self._config.currentCommit,
    )

    # Past these thresholds, replaying changes costs more than starting over.
    max_commits = self._config.maxCommits or DEFAULT_MAX_COMMITS
    if comparison["commits"] > max_commits:
        return (True, f"too_many_commits ({comparison['commits']} > {max_commits})")

    max_files = self._config.maxFiles or DEFAULT_MAX_FILES
    if comparison["totalChanges"] > max_files:
        return (True, f"too_many_files ({comparison['totalChanges']} > {max_files})")

    # Changed ignore rules can re-include or exclude arbitrary files.
    if self._github.ignore_files_changed(
        self._config.owner,
        self._config.repo,
        previous_state["lastCommitSha"],
        self._config.currentCommit,
    ):
        return (True, "ignore_files_changed")

    return (False, None)


def _full_reindex(self, reason: "Optional[str]") -> "IndexResult":
    """Re-index the whole repository from a tarball snapshot and persist state.

    Args:
        reason: Why the full re-index was triggered (for reporting).
    """
    print(f"Performing full re-index (reason: {reason or 'unknown'})")

    snapshot = self._github.download_tarball(
        self._config.owner, self._config.repo, self._config.currentCommit
    )

    files_to_index = [
        File(path=file_path, contents=text) for file_path, text in snapshot.items()
    ]

    print(f"Adding {len(files_to_index)} files to index...")
    self._context.add_to_index(files_to_index)

    # Capture the exported context so it can be re-imported next run.
    context_state = self._context.export()

    new_state = {
        "contextState": context_state.to_dict(),
        "lastCommitSha": self._config.currentCommit,
        "repository": RepositoryInfo(
            owner=self._config.owner,
            name=self._config.repo,
        ),
    }
    # NOTE(review): json-serializing the state assumes RepositoryInfo is a
    # plain dict/TypedDict — confirm against models.py.
    self._save_state(new_state)

    return IndexResult(
        success=True,
        type="full",
        filesIndexed=len(files_to_index),
        filesDeleted=0,
        checkpointId=context_state.checkpoint_id or "",
        commitSha=self._config.currentCommit,
        reindexReason=reason,
    )
def _incremental_update(self, previous_state: "IndexState") -> "IndexResult":
    """Apply only the file changes made since the last indexed commit.

    Re-hydrates the DirectContext from the previously exported state,
    applies adds/deletes from the commit comparison, and persists the
    updated state.
    """
    print("Performing incremental update...")

    # The SDK imports state from a file path, so round-trip the prior state
    # through a temp file. delete=False because Windows cannot reopen a
    # NamedTemporaryFile while it is open — we close it first and unlink
    # it ourselves.
    temp_file = tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", prefix="github-indexer-incremental-", delete=False
    )
    temp_path = Path(temp_file.name)
    try:
        json.dump(previous_state["contextState"], temp_file, indent=2)
        temp_file.close()

        self._context = DirectContext.import_from_file(
            str(temp_path),
            api_key=self._config.apiToken,
            api_url=self._config.apiUrl,
        )
    finally:
        temp_path.unlink(missing_ok=True)

    comparison = self._github.compare_commits(
        self._config.owner,
        self._config.repo,
        previous_state["lastCommitSha"],
        self._config.currentCommit,
    )

    files_to_add, files_to_delete = self._process_file_changes(comparison["files"])
    print(f"Adding {len(files_to_add)} files, deleting {len(files_to_delete)} files")

    if files_to_add:
        self._context.add_to_index(files_to_add)
    if files_to_delete:
        self._context.remove_from_index(files_to_delete)

    context_state = self._context.export()
    new_state = {
        "contextState": context_state.to_dict(),
        "lastCommitSha": self._config.currentCommit,
        "repository": previous_state["repository"],
    }
    self._save_state(new_state)

    return IndexResult(
        success=True,
        type="incremental",
        filesIndexed=len(files_to_add),
        filesDeleted=len(files_to_delete),
        checkpointId=context_state.checkpoint_id or "",
        commitSha=self._config.currentCommit,
    )


def _process_file_changes(
    self, changes: "list[FileChange]"
) -> "tuple[list[File], list[str]]":
    """Split changes into (files_to_add, paths_to_delete).

    Renames delete the old path and re-add the new one. Added/modified
    entries without downloaded contents are skipped silently — the download
    step already emitted a warning for them.
    """
    files_to_add: "list[File]" = []
    files_to_delete: "list[str]" = []

    for change in changes:
        if change.status in ("added", "modified"):
            if change.contents:
                files_to_add.append(File(path=change.path, contents=change.contents))
        elif change.status == "removed":
            files_to_delete.append(change.path)
        elif change.status == "renamed":
            if change.previousFilename:
                files_to_delete.append(change.previousFilename)
            if change.contents:
                files_to_add.append(File(path=change.path, contents=change.contents))

    return files_to_add, files_to_delete


def get_api_credentials() -> tuple[str, str]:
    """Read (AUGMENT_API_TOKEN, AUGMENT_API_URL) from the environment.

    Raises:
        ValueError: when either variable is missing or empty.
    """
    api_token = os.environ.get("AUGMENT_API_TOKEN")
    if not api_token:
        raise ValueError("AUGMENT_API_TOKEN environment variable is required")

    api_url = os.environ.get("AUGMENT_API_URL")
    if not api_url:
        raise ValueError(
            "AUGMENT_API_URL environment variable is required. Please set it to your "
            "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')"
        )

    return api_token, api_url
def parse_repository_info() -> tuple[str, str, str, str]:
    """Read repository information from GitHub Actions environment variables.

    Returns:
        (owner, repo, branch, current_commit). Tag builds get a "tag/"
        branch prefix so their index state is kept separate from branches.

    Raises:
        ValueError: when GITHUB_REPOSITORY is malformed or GITHUB_SHA is missing.
    """
    repository = os.environ.get("GITHUB_REPOSITORY", "")
    parts = repository.split("/")

    if len(parts) != 2 or not parts[0] or not parts[1]:
        raise ValueError('GITHUB_REPOSITORY must be in format "owner/repo"')

    owner, repo = parts

    github_ref = os.environ.get("GITHUB_REF", "")
    github_ref_name = os.environ.get("GITHUB_REF_NAME", "")

    if github_ref.startswith("refs/heads/"):
        branch = github_ref_name
    elif github_ref.startswith("refs/tags/"):
        branch = f"tag/{github_ref_name}"
    elif github_ref_name:
        branch = github_ref_name
    else:
        # Fall back for non-Actions invocations (e.g. local runs).
        branch = os.environ.get("BRANCH", "main")

    current_commit = os.environ.get("GITHUB_SHA", "")
    if not current_commit:
        raise ValueError("GITHUB_SHA environment variable is required")

    return owner, repo, branch, current_commit


def load_config() -> "IndexConfig":
    """Assemble the IndexConfig from environment variables.

    Raises:
        ValueError: when a required variable is missing, or when MAX_COMMITS /
            MAX_FILES is set but not an integer.
    """
    github_token = os.environ.get("GITHUB_TOKEN")
    if not github_token:
        raise ValueError("GITHUB_TOKEN environment variable is required")

    api_token, api_url = get_api_credentials()
    owner, repo, branch, current_commit = parse_repository_info()

    max_commits = os.environ.get("MAX_COMMITS")
    max_files = os.environ.get("MAX_FILES")

    return IndexConfig(
        apiToken=api_token,
        apiUrl=api_url,
        githubToken=github_token,
        owner=owner,
        repo=repo,
        branch=branch,
        currentCommit=current_commit,
        maxCommits=int(max_commits) if max_commits else None,
        maxFiles=int(max_files) if max_files else None,
    )


def get_state_path(branch: str) -> str:
    """Return the state-file path for *branch*.

    The branch name is sanitized for filesystem use; the STATE_PATH
    environment variable overrides the default location entirely.
    """
    sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch)
    return os.environ.get(
        "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json"
    )


def _write_github_outputs(result) -> None:
    """Append this run's step outputs to $GITHUB_OUTPUT (no-op outside Actions)."""
    github_output = os.environ.get("GITHUB_OUTPUT")
    if not github_output:
        return
    output_lines = [
        f"success={result.success}",
        f"type={result.type}",
        f"files_indexed={result.filesIndexed}",
        f"files_deleted={result.filesDeleted}",
        f"checkpoint_id={result.checkpointId}",
        f"commit_sha={result.commitSha}",
    ]
    with open(github_output, "a") as f:
        f.write("\n".join(output_lines) + "\n")


def main() -> None:
    """Entry point: configure, index, report results, and set step outputs.

    Fix: step outputs are written to $GITHUB_OUTPUT BEFORE the failure exit.
    The workflow prints the outputs in an `if: always()` step, so previously
    a failed run exited with status 1 without ever writing them, leaving
    that step with empty values.
    """
    print("GitHub Action Indexer - Starting...")

    try:
        config = load_config()
        state_path = get_state_path(config.branch)

        print(f"Repository: {config.owner}/{config.repo}")
        print(f"Branch: {config.branch}")
        print(f"Commit ref: {config.currentCommit}")
        print(f"State path: {state_path}")

        # Create DirectContext and resolve the (possibly symbolic) commit ref.
        context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl)
        manager = IndexManager(context, config, state_path)
        manager.resolve_commit_sha()

        print(f"Resolved commit SHA: {config.currentCommit}")

        result = manager.index()

        print("\n=== Indexing Results ===")
        print(f"Success: {result.success}")
        print(f"Type: {result.type}")
        print(f"Files Indexed: {result.filesIndexed}")
        print(f"Files Deleted: {result.filesDeleted}")
        print(f"Checkpoint ID: {result.checkpointId}")
        print(f"Commit SHA: {result.commitSha}")

        if result.reindexReason:
            print(f"Re-index Reason: {result.reindexReason}")

        # Write outputs before any failure exit so the workflow's
        # always()-guarded "Print results" step can read them.
        _write_github_outputs(result)

        if result.error:
            print(f"Error: {result.error}", file=sys.stderr)
            sys.exit(1)

        print("\nIndexing completed successfully!")

    except Exception as error:
        print(f"Fatal error: {error}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@dataclass
class FileChange:
    """
    Represents a file change detected between commits.

    Used to track what files need to be indexed or removed from the index.
    A "renamed" change carries both the new ``path`` and the
    ``previousFilename``; consumers delete the old path and index the new
    one. ``contents`` is expected to be populated only for changes that add
    or modify a file.
    """

    path: str
    """File path"""

    status: Literal["added", "modified", "removed", "renamed"]
    """Change status: added, modified, removed, renamed"""

    previousFilename: Optional[str] = None
    """Previous filename (for renames)"""

    contents: Optional[str] = None
    """File contents (for added/modified files)"""

    oldBlobName: Optional[str] = None
    """Blob name from previous index (for modified/removed files)"""
+ """ + + apiToken: str + """Augment API token""" + + apiUrl: str + """Augment API URL (provided via AUGMENT_API_URL env var)""" + + githubToken: str + """GitHub token""" + + owner: str + """Repository owner""" + + repo: str + """Repository name""" + + branch: str + """Branch to index""" + + currentCommit: str + """Current commit SHA""" + + maxCommits: Optional[int] = None + """Maximum commits before full re-index""" + + maxFiles: Optional[int] = None + """Maximum file changes before full re-index""" + + +@dataclass +class IndexResult: + """ + Result from an indexing operation. + + Contains information about what was indexed and whether it was successful. + """ + + success: bool + """Whether indexing was successful""" + + type: Literal["full", "incremental", "no-changes"] + """Type of indexing performed""" + + filesIndexed: int + """Number of files indexed""" + + filesDeleted: int + """Number of files deleted""" + + checkpointId: str + """New checkpoint ID""" + + commitSha: str + """Commit SHA that was indexed""" + + error: Optional[str] = None + """Error message if failed""" + + reindexReason: Optional[str] = None + """Reason for full re-index (if applicable)""" + diff --git a/augment_indexer/requirements.txt b/augment_indexer/requirements.txt new file mode 100644 index 0000000..5552b4e --- /dev/null +++ b/augment_indexer/requirements.txt @@ -0,0 +1,14 @@ +# GitHub Action Indexer dependencies + +# Augment SDK for indexing and search +auggie-sdk>=0.1.0 + +# GitHub API client +PyGithub>=2.1.0 + +# HTTP requests (for tarball download) +requests>=2.25.0 + +# Gitignore-style pattern matching +pathspec>=0.11.0 + diff --git a/augment_indexer/search.py b/augment_indexer/search.py new file mode 100644 index 0000000..fdac426 --- /dev/null +++ b/augment_indexer/search.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +CLI tool to search the indexed repository + +Usage: + cd examples/python-sdk/context + python -m github_action_indexer search "your search query" + python -m 
def load_state(state_path: str) -> "Optional[IndexState]":
    """Load index state from the file system.

    Args:
        state_path: Path to the JSON state file for the current branch.

    Returns:
        The parsed state dict, or None if the file does not exist.
    """
    try:
        # JSON is UTF-8 by spec; be explicit so Windows does not fall back
        # to a locale encoding such as cp1252 when reading the state file.
        with open(state_path, "r", encoding="utf-8") as f:
            # NOTE(review): a corrupt state file raises JSONDecodeError here
            # rather than returning None — confirm that is intended.
            return json.load(f)
    except FileNotFoundError:
        return None
Please set it to your " + "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')", + file=sys.stderr, + ) + sys.exit(1) + + print(f'Searching for: "{args.query}"') + if args.max_chars is not None: + print(f"Limiting results to max {args.max_chars} characters\n") + else: + print() + + try: + # Load the index state first + state_path = get_state_path() + print(f"Loading index state from: {state_path}") + state = load_state(state_path) + + if not state: + print("Error: No index state found. Run indexing first.", file=sys.stderr) + print(" python -m github_action_indexer index", file=sys.stderr) + sys.exit(1) + + # Create a temporary file with the context state for import + # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open + temp_file = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", prefix="github-indexer-state-", delete=False + ) + temp_path = Path(temp_file.name) + try: + json.dump(state["contextState"], temp_file, indent=2) + temp_file.close() # Close before reading on Windows + + # Import state using DirectContext.import_from_file + context = DirectContext.import_from_file( + str(temp_path), api_key=api_token, api_url=api_url + ) + finally: + temp_path.unlink(missing_ok=True) + + file_count = len(state["contextState"].get("blobs", [])) + + print(f"Loaded index: {file_count} files indexed") + print(f"Repository: {state['repository']['owner']}/{state['repository']['name']}") + print(f"Last indexed commit: {state['lastCommitSha']}\n") + + # Perform search with optional character limit + results = context.search(args.query, max_output_length=args.max_chars) + + if not results or results.strip() == "": + print("No results found.") + return + + print("Search results:\n") + print(results) + + except Exception as error: + print(f"Search failed: {error}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() +