From 3b74b1fad39ad6f2a1e900bc1f31f11f6e10f128 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Fri, 13 Feb 2026 11:29:56 -0500 Subject: [PATCH 1/3] Optimize rebase/squash rewrite paths and harden regression coverage --- .../git/benchmark_nasty_squashes.sh | 505 ++++++++++++++ src/authorship/post_commit.rs | 388 ++++++++--- src/authorship/rebase_authorship.rs | 373 +++++++++-- src/authorship/virtual_attribution.rs | 623 +++++++++++++++--- src/commands/blame.rs | 315 ++++++--- src/commands/checkpoint.rs | 138 ++++ src/commands/hooks/commit_hooks.rs | 21 + src/git/authorship_traversal.rs | 81 ++- src/git/repo_storage.rs | 115 ++++ src/git/repository.rs | 289 ++++++-- tests/squash_merge.rs | 126 ++++ 11 files changed, 2589 insertions(+), 385 deletions(-) create mode 100755 scripts/benchmarks/git/benchmark_nasty_squashes.sh diff --git a/scripts/benchmarks/git/benchmark_nasty_squashes.sh b/scripts/benchmarks/git/benchmark_nasty_squashes.sh new file mode 100755 index 000000000..5d47eb390 --- /dev/null +++ b/scripts/benchmarks/git/benchmark_nasty_squashes.sh @@ -0,0 +1,505 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Benchmark git-ai squash performance against plain git on heavy synthetic commit stacks. + +Usage: + benchmark_nasty_squashes.sh [options] + +Options: + --repo-url OSS repo to clone (default: https://github.com/python/cpython.git) + --work-root Working directory (default: /tmp/git-ai-nasty-squash-) + --feature-commits Number of AI feature commits (default: 180) + --main-commits Number of upstream target-branch commits (default: 60) + --files Number of generated feature files (default: 30) + --lines-per-file Lines per generated file (default: 1500) + --burst-every Every Nth feature commit rewrites all generated files (default: 20) + --git-bin Wrapped git binary (default: wrapper next to git-ai, else PATH git) + --plain-git-bin Plain git binary (default: /usr/bin/git, else first non-wrapper git in PATH) + --git-ai-bin git-ai binary (default: PATH git-ai) + --skip-clone Reuse existing clone in /repo + -h, --help Show help + +Outputs: + - Logs: /logs/*.log + - Summary: /summary.txt + - Results TSV: /results.tsv +EOF +} + +REPO_URL="https://github.com/python/cpython.git" +WORK_ROOT="" +FEATURE_COMMITS=180 +MAIN_COMMITS=60 +FILES=30 +LINES_PER_FILE=1500 +BURST_EVERY=20 +SKIP_CLONE=0 +GIT_BIN="" +PLAIN_GIT_BIN="" +GIT_AI_BIN="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --repo-url) REPO_URL="$2"; shift 2 ;; + --work-root) WORK_ROOT="$2"; shift 2 ;; + --feature-commits) FEATURE_COMMITS="$2"; shift 2 ;; + --main-commits) MAIN_COMMITS="$2"; shift 2 ;; + --files) FILES="$2"; shift 2 ;; + --lines-per-file) LINES_PER_FILE="$2"; shift 2 ;; + --burst-every) BURST_EVERY="$2"; shift 2 ;; + --git-bin) GIT_BIN="$2"; shift 2 ;; + --plain-git-bin) PLAIN_GIT_BIN="$2"; shift 2 ;; + --git-ai-bin) GIT_AI_BIN="$2"; shift 2 ;; + --skip-clone) SKIP_CLONE=1; shift ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown argument: $1"; usage; exit 1 ;; + esac +done + +if [[ -z "$GIT_AI_BIN" ]]; then + GIT_AI_BIN="$(command -v git-ai || true)" +fi +if [[ -z "$GIT_AI_BIN" ]]; then + echo "error: git-ai not found in PATH" >&2 + exit 1 +fi + +if [[ -z "$GIT_BIN" ]]; then + CANDIDATE_WRAP_GIT="$(dirname "$GIT_AI_BIN")/git" + if [[ -x "$CANDIDATE_WRAP_GIT" ]]; then + GIT_BIN="$CANDIDATE_WRAP_GIT" + else + GIT_BIN="$(command -v git)" + fi +fi + +detect_plain_git_bin() { + local wrapped="$1" + + if [[ -x /usr/bin/git && "/usr/bin/git" != "$wrapped" ]]; then + echo "/usr/bin/git" + return + fi + + 
local candidate + while IFS= read -r candidate; do + if [[ -n "$candidate" && "$candidate" != "$wrapped" ]]; then + echo "$candidate" + return + fi + done < <(which -a git 2>/dev/null | awk '!seen[$0]++') + + echo "$wrapped" +} + +if [[ -z "$PLAIN_GIT_BIN" ]]; then + PLAIN_GIT_BIN="$(detect_plain_git_bin "$GIT_BIN")" +fi + +if [[ -z "$WORK_ROOT" ]]; then + WORK_ROOT="${TMPDIR:-/tmp}/git-ai-nasty-squash-$(date +%Y%m%d-%H%M%S)" +fi + +REPO_DIR="$WORK_ROOT/repo" +LOG_DIR="$WORK_ROOT/logs" +SUMMARY_FILE="$WORK_ROOT/summary.txt" +RESULTS_TSV="$WORK_ROOT/results.tsv" +mkdir -p "$WORK_ROOT" "$LOG_DIR" + +now_ns() { + python3 - <<'PY' +import time +print(time.time_ns()) +PY +} + +seconds_from_ns_delta() { + local start_ns="$1" + local end_ns="$2" + python3 - "$start_ns" "$end_ns" <<'PY' +import sys +s = int(sys.argv[1]) +e = int(sys.argv[2]) +print(f"{(e - s) / 1_000_000_000:.3f}") +PY +} + +strip_ansi_file() { + local src="$1" + local dst="$2" + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "$src" > "$dst" +} + +extract_perf_field() { + local log_file="$1" + local command="$2" + local field="$3" + python3 - "$log_file" "$command" "$field" <<'PY' +import json +import sys + +path, cmd, field = sys.argv[1], sys.argv[2], sys.argv[3] +last = None +try: + with open(path, "r", encoding="utf-8", errors="replace") as f: + for line in f: + if "[git-ai (perf-json)]" not in line: + continue + start = line.find("{") + if start < 0: + continue + payload = line[start:].strip() + try: + obj = json.loads(payload) + except Exception: + continue + if obj.get("command") == cmd: + last = obj +except FileNotFoundError: + pass + +if last is not None and field in last: + print(last[field]) +PY +} + +g_plain() { + "$PLAIN_GIT_BIN" -C "$REPO_DIR" "$@" +} + +g_ai() { + GIT_AI_DEBUG=0 GIT_AI_DEBUG_PERFORMANCE=0 "$GIT_BIN" -C "$REPO_DIR" "$@" +} + +generate_file() { + local path="$1" + local seed="$2" + local lines="$3" + python3 - "$path" "$seed" "$lines" <<'PY' +import os +import sys + +path = sys.argv[1] +seed = int(sys.argv[2]) +lines = int(sys.argv[3]) + +os.makedirs(os.path.dirname(path), exist_ok=True) +with open(path, "w", encoding="utf-8") as f: + for i in range(1, lines + 1): + payload = (seed * 1315423911 + i * 2654435761) & 0xFFFFFFFF + f.write(f"seed={seed:08d} line={i:06d} payload={payload:08x}\n") +PY +} + +run_ai_checkpoint() { + ( + cd "$REPO_DIR" + GIT_AI_DEBUG=0 GIT_AI_DEBUG_PERFORMANCE=0 "$GIT_AI_BIN" checkpoint mock_ai >/dev/null + ) +} + +ensure_clean_state() { + g_ai rebase --abort >/dev/null 2>&1 || true + g_ai merge --abort >/dev/null 2>&1 || true + g_ai am --abort >/dev/null 2>&1 || true + g_ai cherry-pick --abort >/dev/null 2>&1 || true +} + +run_mode_squash() { + local scenario="$1" + local mode="$2" + local source_branch="$3" + local target_branch="$4" + + local run_branch="bench-${scenario}-${mode}" + local merge_log="$LOG_DIR/${scenario}-${mode}-merge.log" + local merge_clean="$LOG_DIR/${scenario}-${mode}-merge.clean.log" + local commit_log="$LOG_DIR/${scenario}-${mode}-commit.log" + local commit_clean="$LOG_DIR/${scenario}-${mode}-commit.clean.log" + + local git_cmd + if [[ "$mode" == "plain" ]]; then + git_cmd="$PLAIN_GIT_BIN" + else + git_cmd="$GIT_BIN" + fi + + ensure_clean_state + "$git_cmd" -C "$REPO_DIR" checkout -B "$run_branch" "$target_branch" >/dev/null + + local merge_status="ok" + local commit_status="ok" + + local merge_start_ns merge_end_ns merge_s + merge_start_ns="$(now_ns)" + if [[ "$mode" == "plain" ]]; then + if "$git_cmd" -C "$REPO_DIR" merge --squash "$source_branch" >"$merge_log" 
2>&1; then + merge_status="ok" + else + merge_status="fail" + fi + else + if GIT_AI_DEBUG=1 GIT_AI_DEBUG_PERFORMANCE=2 "$git_cmd" -C "$REPO_DIR" merge --squash "$source_branch" >"$merge_log" 2>&1; then + merge_status="ok" + else + merge_status="fail" + fi + fi + merge_end_ns="$(now_ns)" + merge_s="$(seconds_from_ns_delta "$merge_start_ns" "$merge_end_ns")" + + local commit_start_ns commit_end_ns commit_s + commit_start_ns="$(now_ns)" + if [[ "$merge_status" == "ok" ]]; then + if [[ "$mode" == "plain" ]]; then + if "$git_cmd" -C "$REPO_DIR" commit -m "bench(squash): $scenario $mode" >"$commit_log" 2>&1; then + commit_status="ok" + else + commit_status="fail" + fi + else + if GIT_AI_DEBUG=1 GIT_AI_DEBUG_PERFORMANCE=2 "$git_cmd" -C "$REPO_DIR" commit -m "bench(squash): $scenario $mode" >"$commit_log" 2>&1; then + commit_status="ok" + else + commit_status="fail" + fi + fi + else + commit_status="skip" + : >"$commit_log" + fi + commit_end_ns="$(now_ns)" + commit_s="$(seconds_from_ns_delta "$commit_start_ns" "$commit_end_ns")" + + strip_ansi_file "$merge_log" "$merge_clean" + strip_ansi_file "$commit_log" "$commit_clean" + + local overall_status="ok" + if [[ "$merge_status" != "ok" || "$commit_status" != "ok" ]]; then + overall_status="fail" + ensure_clean_state + fi + + local head_sha note_state + head_sha="$("$git_cmd" -C "$REPO_DIR" rev-parse HEAD 2>/dev/null || true)" + if [[ "$mode" == "ai" && -n "$head_sha" ]] && g_ai notes --ref=ai show "$head_sha" >/dev/null 2>&1; then + note_state="yes" + else + note_state="no" + fi + + local merge_git_ms="-" merge_pre_ms="-" merge_post_ms="-" + local commit_git_ms="-" commit_pre_ms="-" commit_post_ms="-" + if [[ "$mode" == "ai" ]]; then + merge_git_ms="$(extract_perf_field "$merge_clean" "merge" "git_duration_ms")" + merge_pre_ms="$(extract_perf_field "$merge_clean" "merge" "pre_command_duration_ms")" + merge_post_ms="$(extract_perf_field "$merge_clean" "merge" "post_command_duration_ms")" + commit_git_ms="$(extract_perf_field "$commit_clean" "commit" "git_duration_ms")" + commit_pre_ms="$(extract_perf_field "$commit_clean" "commit" "pre_command_duration_ms")" + commit_post_ms="$(extract_perf_field "$commit_clean" "commit" "post_command_duration_ms")" + fi + + local total_s + total_s="$(python3 - "$merge_s" "$commit_s" <<'PY' +import sys +print(f"{float(sys.argv[1]) + float(sys.argv[2]):.3f}") +PY +)" + + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + "$scenario" "$mode" "$overall_status" "$merge_s" "$commit_s" "$total_s" \ + "$merge_git_ms" "$merge_pre_ms" "$merge_post_ms" \ + "$commit_git_ms" "$commit_pre_ms" "$commit_post_ms" \ + "$note_state" "$source_branch" "$target_branch" >>"$RESULTS_TSV" + + { + echo "scenario: $scenario" + echo "mode: $mode" + echo "status: $overall_status" + echo "source_branch: $source_branch" + echo "target_branch: $target_branch" + echo "result_branch: $run_branch" + echo "head_sha: ${head_sha:-}" + echo "merge_seconds: $merge_s" + echo "commit_seconds: $commit_s" + echo "total_seconds: $total_s" + if [[ "$mode" == "ai" ]]; then + echo "merge_git_duration_ms: ${merge_git_ms:-}" + echo "merge_pre_duration_ms: ${merge_pre_ms:-}" + echo "merge_post_duration_ms: ${merge_post_ms:-}" + echo "commit_git_duration_ms: ${commit_git_ms:-}" + echo "commit_pre_duration_ms: ${commit_pre_ms:-}" + echo "commit_post_duration_ms: ${commit_post_ms:-}" + fi + echo "head_has_ai_note: $note_state" + echo "merge_log: $merge_log" + echo "commit_log: $commit_log" + echo + } >>"$SUMMARY_FILE" + + echo "scenario=$scenario 
mode=$mode status=$overall_status merge=${merge_s}s commit=${commit_s}s total=${total_s}s note=$note_state" + if [[ "$mode" == "ai" ]]; then + echo " merge breakdown ms: pre=${merge_pre_ms:-?} git=${merge_git_ms:-?} post=${merge_post_ms:-?}" + echo " commit breakdown ms: pre=${commit_pre_ms:-?} git=${commit_git_ms:-?} post=${commit_post_ms:-?}" + fi +} + +echo "=== git-ai nasty squash benchmark ===" +echo "repo_url=$REPO_URL" +echo "work_root=$WORK_ROOT" +echo "repo_dir=$REPO_DIR" +echo "plain_git_bin=$PLAIN_GIT_BIN" +echo "git_bin=$GIT_BIN" +echo "git_ai_bin=$GIT_AI_BIN" +echo "feature_commits=$FEATURE_COMMITS main_commits=$MAIN_COMMITS files=$FILES lines_per_file=$LINES_PER_FILE burst_every=$BURST_EVERY" + +if [[ "$SKIP_CLONE" -eq 0 ]]; then + rm -rf "$REPO_DIR" + echo "Cloning repo..." + "$PLAIN_GIT_BIN" clone --depth 1 "$REPO_URL" "$REPO_DIR" >/dev/null +fi + +if [[ ! -d "$REPO_DIR/.git" ]]; then + echo "error: repo missing at $REPO_DIR" >&2 + exit 1 +fi + +DEFAULT_BRANCH="$("$PLAIN_GIT_BIN" -C "$REPO_DIR" rev-parse --abbrev-ref origin/HEAD 2>/dev/null | sed 's|^origin/||')" +if [[ -z "$DEFAULT_BRANCH" || "$DEFAULT_BRANCH" == "HEAD" ]]; then + if "$PLAIN_GIT_BIN" -C "$REPO_DIR" rev-parse --verify origin/main >/dev/null 2>&1; then + DEFAULT_BRANCH="main" + elif "$PLAIN_GIT_BIN" -C "$REPO_DIR" rev-parse --verify origin/master >/dev/null 2>&1; then + DEFAULT_BRANCH="master" + else + DEFAULT_BRANCH="$("$PLAIN_GIT_BIN" -C "$REPO_DIR" rev-parse --abbrev-ref HEAD)" + fi +fi +echo "default_branch=$DEFAULT_BRANCH" + +g_plain config user.name "git-ai bench" +g_plain config user.email "bench@git-ai.local" +g_plain config commit.gpgsign false +g_plain config gc.auto 0 + +g_plain checkout -B bench-main-base "origin/$DEFAULT_BRANCH" >/dev/null + +echo "Seeding generated files..." +for f in $(seq 1 "$FILES"); do + generate_file "$REPO_DIR/bench/generated/file_${f}.txt" "$((1000 + f))" "$LINES_PER_FILE" +done +g_plain add -A bench/generated +g_plain commit -m "bench: seed generated files" >/dev/null +BASE_SHA="$(g_plain rev-parse HEAD)" + +echo "Creating feature branch with heavy AI history..." +g_plain checkout -B bench-feature "$BASE_SHA" >/dev/null +for i in $(seq 1 "$FEATURE_COMMITS"); do + if (( i % BURST_EVERY == 0 )); then + for f in $(seq 1 "$FILES"); do + generate_file "$REPO_DIR/bench/generated/file_${f}.txt" "$((50000 + i * 1000 + f))" "$LINES_PER_FILE" + done + else + f=$(( (i - 1) % FILES + 1 )) + generate_file "$REPO_DIR/bench/generated/file_${f}.txt" "$((50000 + i * 1000 + f))" "$LINES_PER_FILE" + fi + + run_ai_checkpoint + g_ai add -A bench/generated + g_ai commit -m "bench(ai): feature commit $i" >/dev/null + + if (( i % 25 == 0 || i == FEATURE_COMMITS )); then + echo " feature commits: $i/$FEATURE_COMMITS" + fi +done +FEATURE_TIP="$(g_plain rev-parse HEAD)" + +echo "Creating diverged target branch with upstream churn..." 
+g_plain checkout -B bench-main-diverged "$BASE_SHA" >/dev/null +for i in $(seq 1 "$MAIN_COMMITS"); do + uf=$(( (i - 1) % 3 + 1 )) + generate_file "$REPO_DIR/bench/upstream/upstream_${uf}.txt" "$((900000 + i))" "$((LINES_PER_FILE / 2))" + g_plain add -A bench/upstream + g_plain commit -m "bench(main): upstream commit $i" >/dev/null + if (( i % 20 == 0 || i == MAIN_COMMITS )); then + echo " main commits: $i/$MAIN_COMMITS" + fi +done +MAIN_DIVERGED_TIP="$(g_plain rev-parse HEAD)" + +echo -e "scenario\tmode\tstatus\tmerge_s\tcommit_s\ttotal_s\tmerge_git_ms\tmerge_pre_ms\tmerge_post_ms\tcommit_git_ms\tcommit_pre_ms\tcommit_post_ms\thead_note\tsource\ttarget" >"$RESULTS_TSV" +{ + echo "git-ai nasty squash benchmark summary" + echo "repo_url: $REPO_URL" + echo "repo_dir: $REPO_DIR" + echo "default_branch: $DEFAULT_BRANCH" + echo "base_sha: $BASE_SHA" + echo "feature_tip: $FEATURE_TIP" + echo "main_diverged_tip: $MAIN_DIVERGED_TIP" + echo "feature_commits: $FEATURE_COMMITS" + echo "main_commits: $MAIN_COMMITS" + echo "files: $FILES" + echo "lines_per_file: $LINES_PER_FILE" + echo "burst_every: $BURST_EVERY" + echo "plain_git_bin: $PLAIN_GIT_BIN" + echo "git_bin: $GIT_BIN" + echo +} >"$SUMMARY_FILE" + +echo +echo "Running squash scenarios..." +run_mode_squash "linear" "plain" "bench-feature" "$BASE_SHA" +run_mode_squash "linear" "ai" "bench-feature" "$BASE_SHA" + +run_mode_squash "diverged" "plain" "bench-feature" "$MAIN_DIVERGED_TIP" +run_mode_squash "diverged" "ai" "bench-feature" "$MAIN_DIVERGED_TIP" + +echo +echo "=== Benchmark complete ===" +echo "Summary: $SUMMARY_FILE" +echo "Results TSV: $RESULTS_TSV" +echo "Logs dir: $LOG_DIR" +column -t -s $'\t' "$RESULTS_TSV" || cat "$RESULTS_TSV" + +echo +echo "== Slowdown summary (ai vs plain) ==" +python3 - "$RESULTS_TSV" <<'PY' +import csv +import sys + +path = sys.argv[1] +rows = list(csv.DictReader(open(path, "r", encoding="utf-8"), delimiter="\t")) +by = {} +for r in rows: + key = r["scenario"] + by.setdefault(key, {})[r["mode"]] = r + +for scenario, modes in by.items(): + if "plain" not in modes or "ai" not in modes: + continue + p = modes["plain"] + a = modes["ai"] + try: + plain_total = float(p["total_s"]) + ai_total = float(a["total_s"]) + plain_merge = float(p["merge_s"]) + ai_merge = float(a["merge_s"]) + plain_commit = float(p["commit_s"]) + ai_commit = float(a["commit_s"]) + except Exception: + continue + + ratio_total = ai_total / plain_total if plain_total > 0 else float("inf") + ratio_merge = ai_merge / plain_merge if plain_merge > 0 else float("inf") + ratio_commit = ai_commit / plain_commit if plain_commit > 0 else float("inf") + + merge_post = a.get("merge_post_ms", "-") + commit_post = a.get("commit_post_ms", "-") + print( + f"{scenario}: total={ratio_total:.1f}x " + f"(merge={ratio_merge:.1f}x, commit={ratio_commit:.1f}x), " + f"ai merge_post_ms={merge_post}, ai commit_post_ms={commit_post}" + ) +PY diff --git a/src/authorship/post_commit.rs b/src/authorship/post_commit.rs index 6c18c8ce4..15754b4b8 100644 --- a/src/authorship/post_commit.rs +++ b/src/authorship/post_commit.rs @@ -1,7 +1,7 @@ use crate::api::{ApiClient, ApiContext}; use crate::authorship::authorship_log_serialization::AuthorshipLog; use crate::authorship::ignore::{ - build_ignore_matcher, effective_ignore_patterns, should_ignore_file_with_matcher, + IgnoreMatcher, build_ignore_matcher, effective_ignore_patterns, should_ignore_file_with_matcher, }; use crate::authorship::prompt_utils::{PromptUpdateResult, update_prompt_from_tool}; use 
crate::authorship::secrets::{redact_secrets_from_prompts, strip_prompt_messages}; @@ -15,6 +15,7 @@ use crate::git::repository::Repository; use crate::utils::debug_log; use std::collections::{HashMap, HashSet}; use std::io::IsTerminal; +use std::time::Instant; /// Skip expensive post-commit stats when this threshold is exceeded. /// High hunk density is the strongest predictor of slow diff_ai_accepted_stats. @@ -56,6 +57,7 @@ pub fn post_commit( human_author: String, supress_output: bool, ) -> Result<(String, AuthorshipLog), GitAiError> { + let post_commit_start = Instant::now(); // Use base_commit parameter if provided, otherwise use "initial" for empty repos // This matches the convention in checkpoint.rs let parent_sha = base_commit.unwrap_or_else(|| "initial".to_string()); @@ -63,45 +65,67 @@ pub fn post_commit( // Initialize the new storage system let repo_storage = &repo.storage; let working_log = repo_storage.working_log_for_base_commit(&parent_sha); + let squash_precommit_skipped = working_log.was_squash_precommit_skipped(); + let initial_read_start = Instant::now(); + let initial_attributions_for_pathspecs = working_log.read_initial_attributions(); + debug_log(&format!( + "[BENCHMARK] post-commit: read initial attributions took {:?}", + initial_read_start.elapsed() + )); + + // Pull all working log entries from the parent commit. + // For squash commits where pre-commit checkpoint was intentionally skipped, + // only INITIAL data is relevant and we can avoid checkpoint I/O entirely. + let mut parent_working_log = if squash_precommit_skipped { + Vec::new() + } else { + working_log.read_all_checkpoints()? + }; - // Pull all working log entries from the parent commit - - let mut parent_working_log = working_log.read_all_checkpoints()?; - - // debug_log(&format!( - // "edited files: {:?}", - // parent_working_log.edited_files - // )); - - // Update prompts/transcripts to their latest versions and persist to disk - // Do this BEFORE filtering so that all checkpoints (including untracked files) are updated - update_prompts_to_latest(&mut parent_working_log)?; - - // Batch upsert all prompts to database after refreshing (non-fatal if it fails) - if let Err(e) = batch_upsert_prompts_to_db(&parent_working_log, &working_log, &commit_sha) { - debug_log(&format!( - "[Warning] Failed to batch upsert prompts to database: {}", - e - )); - crate::observability::log_error( - &e, - Some(serde_json::json!({ - "operation": "post_commit_batch_upsert", - "commit_sha": commit_sha - })), - ); - } + if !squash_precommit_skipped { + // Update prompts/transcripts to their latest versions and persist to disk + // Do this BEFORE filtering so that all checkpoints (including untracked files) are updated + update_prompts_to_latest(&mut parent_working_log)?; + + // Batch upsert all prompts to database after refreshing (non-fatal if it fails) + if let Err(e) = batch_upsert_prompts_to_db(&parent_working_log, &working_log, &commit_sha) { + debug_log(&format!( + "[Warning] Failed to batch upsert prompts to database: {}", + e + )); + crate::observability::log_error( + &e, + Some(serde_json::json!({ + "operation": "post_commit_batch_upsert", + "commit_sha": commit_sha + })), + ); + } - working_log.write_all_checkpoints(&parent_working_log)?; + working_log.write_all_checkpoints(&parent_working_log)?; + } - // Create VirtualAttributions from working log (fast path - no blame) - // We don't need to run blame because we only care about the working log data - // that was accumulated since the parent commit - let working_va 
= VirtualAttributions::from_just_working_log( - repo.clone(), - parent_sha.clone(), - Some(human_author.clone()), - )?; + // Create VirtualAttributions from working log (fast path - no blame). + // Post-commit conversion only needs line-level attributions, so use the + // lightweight loader to avoid eagerly reading full file contents. + let va_start = Instant::now(); + let working_va = if squash_precommit_skipped { + VirtualAttributions::from_initial_only_line_only( + repo.clone(), + parent_sha.clone(), + Some(human_author.clone()), + )? + } else { + VirtualAttributions::from_just_working_log_line_only( + repo.clone(), + parent_sha.clone(), + Some(human_author.clone()), + )? + }; + debug_log(&format!( + "[BENCHMARK] post-commit: load working VA took {:?}", + va_start.elapsed() + )); // Build pathspecs from AI-relevant checkpoint entries only. // Human-only entries with no AI attribution do not affect authorship output and should not @@ -118,83 +142,142 @@ pub fn post_commit( // Also include files from INITIAL attributions (uncommitted files from previous commits) // These files may not have checkpoints but still need their attribution preserved // when they are finally committed. See issue #356. - let initial_attributions_for_pathspecs = working_log.read_initial_attributions(); for file_path in initial_attributions_for_pathspecs.files.keys() { pathspecs.insert(file_path.clone()); } // Split VirtualAttributions into committed (authorship log) and uncommitted (INITIAL) - let (mut authorship_log, initial_attributions) = working_va - .to_authorship_log_and_initial_working_log( + // Fast path: if there are no relevant worktree changes after commit, we can skip + // expensive unstaged diff processing and convert directly from committed hunks. + let status_start = Instant::now(); + let has_relevant_worktree_changes = + !pathspecs.is_empty() && !repo.status(Some(&pathspecs), false)?.is_empty(); + debug_log(&format!( + "[BENCHMARK] post-commit: status scan took {:?} (has_changes={})", + status_start.elapsed(), + has_relevant_worktree_changes + )); + + let conversion_start = Instant::now(); + let (mut authorship_log, initial_attributions) = if has_relevant_worktree_changes { + working_va.to_authorship_log_and_initial_working_log( + repo, + &parent_sha, + &commit_sha, + Some(&pathspecs), + )? 
+ } else { + let authorship_log = working_va.to_authorship_log_index_only( repo, &parent_sha, &commit_sha, Some(&pathspecs), )?; + let initial_attributions = crate::git::repo_storage::InitialAttributions { + files: HashMap::new(), + prompts: HashMap::new(), + }; + (authorship_log, initial_attributions) + }; + debug_log(&format!( + "[BENCHMARK] post-commit: attribution conversion took {:?}", + conversion_start.elapsed() + )); authorship_log.metadata.base_commit_sha = commit_sha.clone(); + let prompt_policy_start = Instant::now(); // Handle prompts based on effective prompt storage mode for this repository // The effective mode considers include/exclude lists and fallback settings let effective_storage = Config::get().effective_prompt_storage(&Some(repo.clone())); + let has_prompt_messages = authorship_log + .metadata + .prompts + .values() + .any(|prompt| !prompt.messages.is_empty()); match effective_storage { PromptStorageMode::Local => { // Local only: strip all messages from notes (they stay in sqlite only) - strip_prompt_messages(&mut authorship_log.metadata.prompts); + if has_prompt_messages { + strip_prompt_messages(&mut authorship_log.metadata.prompts); + } } PromptStorageMode::Notes => { // Store in notes: redact secrets but keep messages in notes - let count = redact_secrets_from_prompts(&mut authorship_log.metadata.prompts); - if count > 0 { - debug_log(&format!("Redacted {} secrets from prompts", count)); + if has_prompt_messages { + let count = redact_secrets_from_prompts(&mut authorship_log.metadata.prompts); + if count > 0 { + debug_log(&format!("Redacted {} secrets from prompts", count)); + } } } PromptStorageMode::Default => { // "default" - attempt CAS upload, NEVER keep messages in notes - // Check conditions for CAS upload: - // - user is logged in OR using custom API URL - let context = ApiContext::new(None); - let client = ApiClient::new(context); - let using_custom_api = - Config::get().api_base_url() != crate::config::DEFAULT_API_BASE_URL; - let should_enqueue_cas = client.is_logged_in() || using_custom_api; - - if should_enqueue_cas { - // Redact secrets before uploading to CAS - let redaction_count = - redact_secrets_from_prompts(&mut authorship_log.metadata.prompts); - if redaction_count > 0 { - debug_log(&format!( - "Redacted {} secrets from prompts before CAS upload", - redaction_count - )); - } + if has_prompt_messages { + // Check conditions for CAS upload: + // - user is logged in OR using custom API URL + // - squash fast-path is always false (prompts are inherited) + let should_enqueue_cas = if squash_precommit_skipped { + false + } else { + let context = ApiContext::new(None); + let client = ApiClient::new(context); + let using_custom_api = + Config::get().api_base_url() != crate::config::DEFAULT_API_BASE_URL; + client.is_logged_in() || using_custom_api + }; + + if should_enqueue_cas { + // Redact secrets before uploading to CAS + let redaction_count = + redact_secrets_from_prompts(&mut authorship_log.metadata.prompts); + if redaction_count > 0 { + debug_log(&format!( + "Redacted {} secrets from prompts before CAS upload", + redaction_count + )); + } - if let Err(e) = - enqueue_prompt_messages_to_cas(repo, &mut authorship_log.metadata.prompts) - { - debug_log(&format!( - "[Warning] Failed to enqueue prompt messages to CAS: {}", - e - )); - // Enqueue failed - still strip messages (never keep in notes for "default") + if let Err(e) = + enqueue_prompt_messages_to_cas(repo, &mut authorship_log.metadata.prompts) + { + debug_log(&format!( + "[Warning] Failed to 
enqueue prompt messages to CAS: {}", + e + )); + // Enqueue failed - still strip messages (never keep in notes for "default") + strip_prompt_messages(&mut authorship_log.metadata.prompts); + } + // Success: enqueue function already cleared messages + } else { + // Not enqueueing - strip messages (never keep in notes for "default") strip_prompt_messages(&mut authorship_log.metadata.prompts); } - // Success: enqueue function already cleared messages - } else { - // Not enqueueing - strip messages (never keep in notes for "default") - strip_prompt_messages(&mut authorship_log.metadata.prompts); } } } + debug_log(&format!( + "[BENCHMARK] post-commit: prompt policy handling took {:?}", + prompt_policy_start.elapsed() + )); // Serialize the authorship log + let serialize_start = Instant::now(); let authorship_json = authorship_log .serialize_to_string() .map_err(|_| GitAiError::Generic("Failed to serialize authorship log".to_string()))?; + debug_log(&format!( + "[BENCHMARK] post-commit: serialize authorship took {:?}", + serialize_start.elapsed() + )); + let notes_start = Instant::now(); notes_add(repo, &commit_sha, &authorship_json)?; + debug_log(&format!( + "[BENCHMARK] post-commit: notes_add took {:?}", + notes_start.elapsed() + )); // Compute stats once (needed for both metrics and terminal output), unless preflight // estimate predicts this would be too expensive for the commit hook path. @@ -204,6 +287,7 @@ pub fn post_commit( .map(|commit| commit.parent_count().unwrap_or(0) > 1) .unwrap_or(false); let ignore_patterns = effective_ignore_patterns(repo, &[], &[]); + let stats_preflight_start = Instant::now(); let skip_reason = if is_merge_commit { Some(StatsSkipReason::MergeCommit) } else { @@ -217,6 +301,10 @@ pub fn post_commit( } }) }; + debug_log(&format!( + "[BENCHMARK] post-commit: stats preflight took {:?}", + stats_preflight_start.elapsed() + )); if skip_reason.is_none() { let computed = stats_for_commit_stats(repo, &commit_sha, &ignore_patterns)?; @@ -260,7 +348,16 @@ pub fn post_commit( } // // Clean up old working log + let cleanup_start = Instant::now(); repo_storage.delete_working_log_for_base_commit(&parent_sha)?; + debug_log(&format!( + "[BENCHMARK] post-commit: cleanup old working log took {:?}", + cleanup_start.elapsed() + )); + debug_log(&format!( + "[BENCHMARK] post-commit: total post_commit duration {:?}", + post_commit_start.elapsed() + )); if !supress_output && !Config::get().is_quiet() { // Only print stats if we're in an interactive terminal and quiet mode is disabled @@ -309,26 +406,36 @@ fn estimate_stats_cost( commit_sha: &str, ignore_patterns: &[String], ) -> Result { - let mut added_lines_by_file = repo.diff_added_lines(parent_sha, commit_sha, None)?; let ignore_matcher = build_ignore_matcher(ignore_patterns); + // Use a cheap --numstat preflight first. On large commits this avoids parsing full + // patch hunks just to decide we should skip expensive stats. + let (files_with_additions, added_lines) = + get_numstat_added_lines(repo, parent_sha, commit_sha, &ignore_matcher)?; + + if files_with_additions >= STATS_SKIP_MAX_FILES_WITH_ADDITIONS + || added_lines >= STATS_SKIP_MAX_ADDED_LINES + { + return Ok(StatsCostEstimate { + files_with_additions, + added_lines, + hunk_ranges: 0, + }); + } + + // Only compute hunk density when needed (smaller commits near threshold). 
+    let mut added_lines_by_file = repo.diff_added_lines(parent_sha, commit_sha, None)?;
     added_lines_by_file
         .retain(|file_path, _| !should_ignore_file_with_matcher(file_path, &ignore_matcher));
-
     let files_with_additions = added_lines_by_file
         .values()
         .filter(|lines| !lines.is_empty())
         .count();
-
-    let mut added_lines = 0usize;
-    let mut hunk_ranges = 0usize;
-
-    for (_file, lines) in added_lines_by_file {
-        if lines.is_empty() {
-            continue;
-        }
-        added_lines += lines.len();
-        hunk_ranges += count_line_ranges(&lines);
-    }
+    let added_lines = added_lines_by_file.values().map(std::vec::Vec::len).sum();
+    let hunk_ranges = added_lines_by_file
+        .values()
+        .filter(|lines| !lines.is_empty())
+        .map(|lines| count_line_ranges(lines))
+        .sum();
 
     Ok(StatsCostEstimate {
         files_with_additions,
@@ -357,6 +464,46 @@ fn count_line_ranges(lines: &[u32]) -> usize {
     ranges
 }
 
+fn get_numstat_added_lines(
+    repo: &Repository,
+    parent_sha: &str,
+    commit_sha: &str,
+    ignore_matcher: &IgnoreMatcher,
+) -> Result<(usize, usize), GitAiError> {
+    let mut args = repo.global_args_for_exec();
+    args.push("diff".to_string());
+    args.push("--numstat".to_string());
+    args.push(parent_sha.to_string());
+    args.push(commit_sha.to_string());
+
+    let output = crate::git::repository::exec_git(&args)?;
+    let stdout = String::from_utf8(output.stdout)?;
+
+    let mut files_with_additions = 0usize;
+    let mut added_lines = 0usize;
+
+    for line in stdout.lines() {
+        if line.trim().is_empty() {
+            continue;
+        }
+        let parts: Vec<&str> = line.split('\t').collect();
+        if parts.len() < 3 {
+            continue;
+        }
+        if should_ignore_file_with_matcher(parts[2], ignore_matcher) {
+            continue;
+        }
+        if let Ok(added) = parts[0].parse::<usize>()
+            && added > 0
+        {
+            files_with_additions += 1;
+            added_lines += added;
+        }
+    }
+
+    Ok((files_with_additions, added_lines))
+}
+
 /// Update prompts/transcripts in working log checkpoints to their latest versions.
 /// This helps prevent race conditions where we miss the last message in a conversation.
/// @@ -638,8 +785,10 @@ fn record_commit_metrics( mod tests { use super::{ STATS_SKIP_MAX_ADDED_LINES, STATS_SKIP_MAX_FILES_WITH_ADDITIONS, STATS_SKIP_MAX_HUNKS, - StatsCostEstimate, count_line_ranges, should_skip_expensive_post_commit_stats, + StatsCostEstimate, checkpoint_entry_requires_post_processing, count_line_ranges, + should_skip_expensive_post_commit_stats, }; + use crate::authorship::working_log::{Checkpoint, CheckpointKind, WorkingLogEntry}; use crate::git::test_utils::TmpRepo; #[test] @@ -683,6 +832,69 @@ mod tests { assert!(should_skip_expensive_post_commit_stats(&by_files)); } + #[test] + fn test_checkpoint_entry_requires_post_processing_handles_human_and_ai_paths() { + let empty_human_checkpoint = Checkpoint::new( + CheckpointKind::Human, + String::new(), + "human".to_string(), + vec![], + ); + let empty_entry = WorkingLogEntry::new( + "file.txt".to_string(), + "blob".to_string(), + vec![], + vec![], + ); + assert!( + !checkpoint_entry_requires_post_processing(&empty_human_checkpoint, &empty_entry), + "human entry with no AI attribution should be skipped" + ); + + let ai_line_entry = WorkingLogEntry::new( + "file.txt".to_string(), + "blob".to_string(), + vec![], + vec![crate::authorship::attribution_tracker::LineAttribution { + start_line: 1, + end_line: 1, + author_id: "mock_ai".to_string(), + overrode: None, + }], + ); + assert!( + checkpoint_entry_requires_post_processing(&empty_human_checkpoint, &ai_line_entry), + "human entry with AI line attribution should be processed" + ); + + let overridden_entry = WorkingLogEntry::new( + "file.txt".to_string(), + "blob".to_string(), + vec![], + vec![crate::authorship::attribution_tracker::LineAttribution { + start_line: 1, + end_line: 1, + author_id: CheckpointKind::Human.to_str(), + overrode: Some("mock_ai".to_string()), + }], + ); + assert!( + checkpoint_entry_requires_post_processing(&empty_human_checkpoint, &overridden_entry), + "human overrides must be processed" + ); + + let ai_checkpoint = Checkpoint::new( + CheckpointKind::AiAgent, + String::new(), + "ai".to_string(), + vec![], + ); + assert!( + checkpoint_entry_requires_post_processing(&ai_checkpoint, &empty_entry), + "all AI checkpoints should be processed" + ); + } + #[test] fn test_post_commit_empty_repo_with_checkpoint() { // Create an empty repo (no commits yet) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 03a648083..429478870 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -11,6 +11,7 @@ use crate::git::repository::{CommitRange, Repository, exec_git, exec_git_stdin}; use crate::git::rewrite_log::RewriteLogEvent; use crate::utils::{debug_log, debug_performance_log}; use std::collections::{BTreeMap, HashMap, HashSet}; +use std::time::Instant; #[derive(Clone, Copy, Default)] struct PromptLineMetrics { @@ -38,7 +39,6 @@ pub fn rewrite_authorship_if_needed( repo: &Repository, last_event: &RewriteLogEvent, commit_author: String, - _full_log: &Vec, supress_output: bool, ) -> Result<(), GitAiError> { match last_event { @@ -136,12 +136,29 @@ pub fn prepare_working_log_after_squash( target_branch_head_sha: &str, _human_author: &str, ) -> Result<(), GitAiError> { + const SMALL_STAGED_SOURCE_FILTER_BYPASS_MAX_FILES: usize = 64; + use crate::authorship::virtual_attribution::{ VirtualAttributions, merge_attributions_favoring_first, }; - // Step 1: Find merge base between source and target to optimize blame - // We only need to look at commits after the merge base, not entire history + 
let prep_start = Instant::now();
+
+    // Step 0: Only consider files actually staged by merge --squash.
+    // Diffing source/target heads pulls in unrelated branch churn and explodes work.
+    let mut staged_files: Vec<String> = repo.get_staged_filenames()?.into_iter().collect();
+    staged_files.sort();
+    if staged_files.is_empty() {
+        return Ok(());
+    }
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: staged files discovery took {:?} ({} files)",
+        prep_start.elapsed(),
+        staged_files.len()
+    ));
+
+    // Step 1: Find merge base between source and target to bound history walks.
+    let history_start = Instant::now();
     let merge_base = repo
         .merge_base(
             source_head_sha.to_string(),
@@ -149,67 +166,228 @@ pub fn prepare_working_log_after_squash(
         )
         .ok();
 
-    // Step 2: Get list of changed files between the two branches
-    let changed_files = repo.diff_changed_files(source_head_sha, target_branch_head_sha)?;
+    let source_commits =
+        commits_in_range_with_merge_base(repo, merge_base.as_ref(), source_head_sha)?;
+    let target_commits =
+        commits_in_range_with_merge_base(repo, merge_base.as_ref(), target_branch_head_sha)?;
+
+    // Only run expensive attribution logic on staged files that were ever touched by AI
+    // on each side of the merge base.
+    //
+    // Optimization: for small staged sets, skip source-side touched-file extraction and
+    // just blame staged files directly when source has any authorship notes.
+    let source_ai_files = if staged_files.len() <= SMALL_STAGED_SOURCE_FILTER_BYPASS_MAX_FILES {
+        if commits_have_authorship_notes(repo, &source_commits)? {
+            staged_files.clone()
+        } else {
+            Vec::new()
+        }
+    } else {
+        filter_pathspecs_to_ai_touched_files(repo, &source_commits, &staged_files)?
+    };
 
-    if changed_files.is_empty() {
-        // No files changed, nothing to do
+    let target_ai_files = if commits_have_authorship_notes(repo, &target_commits)? {
+        filter_pathspecs_to_ai_touched_files(repo, &target_commits, &staged_files)?
+    } else {
+        Vec::new()
+    };
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: history+AI file filtering took {:?} (source_ai={}, target_ai={})",
+        history_start.elapsed(),
+        source_ai_files.len(),
+        target_ai_files.len()
+    ));
+
+    if source_ai_files.is_empty() && target_ai_files.is_empty() {
+        debug_log("merge --squash staged files contain no AI-touched paths; skipping rewrite prep");
         return Ok(());
     }
 
-    // Step 3: Create VirtualAttributions for both branches
-    // Use merge_base to limit blame range for performance
-    let repo_clone = repo.clone();
-    let merge_base_clone = merge_base.clone();
-    let source_va = smol::block_on(async {
-        VirtualAttributions::new_for_base_commit(
-            repo_clone,
-            source_head_sha.to_string(),
-            &changed_files,
-            merge_base_clone,
-        )
-        .await
-    })?;
+    let mut tracked_files_set: HashSet<String> = source_ai_files.iter().cloned().collect();
+    tracked_files_set.extend(target_ai_files.iter().cloned());
+    let mut tracked_files: Vec<String> = tracked_files_set.iter().cloned().collect();
+    tracked_files.sort();
 
-    let repo_clone = repo.clone();
-    let target_va = smol::block_on(async {
-        VirtualAttributions::new_for_base_commit(
-            repo_clone,
-            target_branch_head_sha.to_string(),
-            &changed_files,
-            merge_base,
-        )
-        .await
-    })?;
+    // Step 2: Create VirtualAttributions for each side only when needed.
+    // Skipping a side avoids N x blame calls and prompt scans.
+ let source_va_start = Instant::now(); + let mut source_va = if source_ai_files.is_empty() { + VirtualAttributions::empty(repo.clone(), source_head_sha.to_string()) + } else { + let repo_clone = repo.clone(); + let merge_base_clone = merge_base.clone(); + smol::block_on(async { + VirtualAttributions::new_for_base_commit_with_options( + repo_clone, + source_head_sha.to_string(), + &source_ai_files, + merge_base_clone, + false, + ) + .await + })? + }; + debug_log(&format!( + "[BENCHMARK] squash prep: source VA build took {:?}", + source_va_start.elapsed() + )); - // Step 3: Read staged files content (final state after squash) - let staged_files = repo.get_all_staged_files_content(&changed_files)?; + let target_va_start = Instant::now(); + let mut target_va = if target_ai_files.is_empty() { + VirtualAttributions::empty(repo.clone(), target_branch_head_sha.to_string()) + } else { + let repo_clone = repo.clone(); + smol::block_on(async { + VirtualAttributions::new_for_base_commit_with_options( + repo_clone, + target_branch_head_sha.to_string(), + &target_ai_files, + merge_base, + false, + ) + .await + })? + }; + debug_log(&format!( + "[BENCHMARK] squash prep: target VA build took {:?}", + target_va_start.elapsed() + )); + + // Squash rewrite does not require carrying transcript payloads through INITIAL. + // Dropping them here avoids expensive cloning/stripping in post-commit. + source_va.clear_prompt_messages(); + target_va.clear_prompt_messages(); + + // Step 3: Read staged content only for files that can carry AI attribution. + let staged_content_start = Instant::now(); + let staged_files = repo.get_all_staged_files_content(&tracked_files)?; + debug_log(&format!( + "[BENCHMARK] squash prep: staged content read took {:?}", + staged_content_start.elapsed() + )); // Step 4: Merge VirtualAttributions, favoring target branch (HEAD) - let merged_va = merge_attributions_favoring_first(target_va, source_va, staged_files)?; + let merge_va_start = Instant::now(); + let can_reuse_source_directly = target_ai_files.is_empty() + && source_va + .files() + .iter() + .all(|file_path| source_va.get_file_content(file_path) == staged_files.get(file_path)); + let merged_va = if can_reuse_source_directly { + source_va + } else { + merge_attributions_favoring_first(target_va, source_va, staged_files)? 
+    };
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: merge_attributions took {:?}",
+        merge_va_start.elapsed()
+    ));
 
     // Step 5: Convert to INITIAL (everything is uncommitted in a squash)
     // Pass same SHA for parent and commit to get empty diff (no committed hunks)
-    let (_authorship_log, initial_attributions) = merged_va
-        .to_authorship_log_and_initial_working_log(
-            repo,
-            target_branch_head_sha,
-            target_branch_head_sha,
-            None,
-        )?;
+    let to_initial_start = Instant::now();
+    let initial_attributions = if can_reuse_source_directly {
+        let files = merged_va
+            .attributions
+            .iter()
+            .filter_map(|(file_path, (_, line_attrs))| {
+                if line_attrs.is_empty() {
+                    None
+                } else {
+                    Some((file_path.clone(), compress_line_attributions(line_attrs)))
+                }
+            })
+            .collect();
+        let prompts = flatten_prompts_for_metadata(merged_va.prompts())
+            .into_iter()
+            .collect::<HashMap<_, _>>();
+        crate::git::repo_storage::InitialAttributions { files, prompts }
+    } else {
+        let (_authorship_log, initial_attributions) = merged_va
+            .to_authorship_log_and_initial_working_log(
+                repo,
+                target_branch_head_sha,
+                target_branch_head_sha,
+                Some(&tracked_files_set),
+            )?;
+        initial_attributions
+    };
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: VA->INITIAL conversion took {:?}",
+        to_initial_start.elapsed()
+    ));
 
     // Step 6: Write INITIAL file
+    let write_initial_start = Instant::now();
     if !initial_attributions.files.is_empty() {
         let working_log = repo
             .storage
             .working_log_for_base_commit(target_branch_head_sha);
         working_log
             .write_initial_attributions(initial_attributions.files, initial_attributions.prompts)?;
+        if let Ok(tree_oid) = repo.index_tree_oid() {
+            let _ = working_log.write_squash_tree_oid(&tree_oid);
+        }
     }
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: write INITIAL/tree marker took {:?}",
+        write_initial_start.elapsed()
+    ));
+    debug_log(&format!(
+        "[BENCHMARK] squash prep: total prepare_working_log_after_squash took {:?}",
+        prep_start.elapsed()
+    ));
 
     Ok(())
 }
 
+fn compress_line_attributions(
+    line_attrs: &[crate::authorship::attribution_tracker::LineAttribution],
+) -> Vec<crate::authorship::attribution_tracker::LineAttribution> {
+    if line_attrs.is_empty() {
+        return Vec::new();
+    }
+
+    let mut attrs = line_attrs.to_vec();
+    attrs.sort_by(|a, b| {
+        a.start_line
+            .cmp(&b.start_line)
+            .then(a.end_line.cmp(&b.end_line))
+            .then(a.author_id.cmp(&b.author_id))
+            .then(a.overrode.cmp(&b.overrode))
+    });
+
+    let mut compressed: Vec<crate::authorship::attribution_tracker::LineAttribution> =
+        Vec::with_capacity(attrs.len());
+    for attr in attrs {
+        if let Some(last) = compressed.last_mut()
+            && last.author_id == attr.author_id
+            && last.overrode == attr.overrode
+            && attr.start_line <= last.end_line.saturating_add(1)
+        {
+            last.end_line = last.end_line.max(attr.end_line);
+        } else {
+            compressed.push(attr);
+        }
+    }
+
+    compressed
+}
+
+fn commits_in_range_with_merge_base(
+    repo: &Repository,
+    merge_base: Option<&String>,
+    head_sha: &str,
+) -> Result<Vec<String>, GitAiError> {
+    if let Some(base_sha) = merge_base {
+        let range =
+            CommitRange::new_infer_refname(repo, base_sha.clone(), head_sha.to_string(), None)?;
+        Ok(range.all_commits())
+    } else {
+        Ok(vec![head_sha.to_string()])
+    }
+}
+
 /// Rewrite authorship after a squash or rebase merge performed in CI/GUI
 ///
 /// This handles the case where a squash merge or rebase merge was performed via SCM GUI,
@@ -260,13 +438,8 @@ pub fn rewrite_authorship_after_squash_or_rebase(
 
     // Get commits from source branch (from source_head back to merge_base)
    // Uses git rev-list which safely handles the range without infinite walking
-    let source_commits = if let Some(ref base) =
merge_base { - let range = - CommitRange::new_infer_refname(repo, base.clone(), source_head_sha.to_string(), None)?; - range.all_commits() - } else { - vec![source_head_sha.to_string()] - }; + let source_commits = + commits_in_range_with_merge_base(repo, merge_base.as_ref(), source_head_sha)?; let changed_files = filter_pathspecs_to_ai_touched_files(repo, &source_commits, &changed_files)?; @@ -297,6 +470,16 @@ pub fn rewrite_authorship_after_squash_or_rebase( changed_files.len() )); + let target_commits = + commits_in_range_with_merge_base(repo, merge_base.as_ref(), &target_branch_head_sha)?; + let target_ai_files = + filter_pathspecs_to_ai_touched_files(repo, &target_commits, &changed_files)?; + + let mut tracked_files_set: HashSet = changed_files.iter().cloned().collect(); + tracked_files_set.extend(target_ai_files.iter().cloned()); + let mut tracked_files: Vec = tracked_files_set.iter().cloned().collect(); + tracked_files.sort(); + // Step 4: Create VirtualAttributions for both branches // Use merge_base to limit blame range for performance let repo_clone = repo.clone(); @@ -311,19 +494,23 @@ pub fn rewrite_authorship_after_squash_or_rebase( .await })?; - let repo_clone = repo.clone(); - let target_va = smol::block_on(async { - VirtualAttributions::new_for_base_commit( - repo_clone, - target_branch_head_sha.clone(), - &changed_files, - merge_base, - ) - .await - })?; + let target_va = if target_ai_files.is_empty() { + VirtualAttributions::empty(repo.clone(), target_branch_head_sha.clone()) + } else { + let repo_clone = repo.clone(); + smol::block_on(async { + VirtualAttributions::new_for_base_commit( + repo_clone, + target_branch_head_sha.clone(), + &target_ai_files, + merge_base, + ) + .await + })? + }; // Step 4: Read committed files from merge commit (captures final state with conflict resolutions) - let committed_files = get_committed_files_content(repo, merge_commit_sha, &changed_files)?; + let committed_files = get_committed_files_content(repo, merge_commit_sha, &tracked_files)?; debug_log(&format!( "Read {} committed files from merge commit", @@ -2748,7 +2935,8 @@ fn transform_attributions_to_final_state( #[cfg(test)] mod tests { use super::{ - collect_changed_file_contents_from_diff, get_pathspecs_from_commits, + collect_changed_file_contents_from_diff, compress_line_attributions, + get_pathspecs_from_commits, parse_cat_file_batch_output_with_oids, transform_attributions_to_final_state, try_fast_path_rebase_note_remap, walk_commits_to_base, }; @@ -2975,6 +3163,73 @@ mod tests { ); } + #[test] + fn compress_line_attributions_merges_only_compatible_adjacent_ranges() { + let compressed = compress_line_attributions(&[ + LineAttribution { + start_line: 5, + end_line: 5, + author_id: "ai-1".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 1, + end_line: 2, + author_id: "ai-1".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 4, + author_id: "ai-1".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 6, + end_line: 6, + author_id: "ai-1".to_string(), + overrode: Some("human".to_string()), + }, + LineAttribution { + start_line: 9, + end_line: 9, + author_id: "ai-2".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 7, + end_line: 8, + author_id: "ai-2".to_string(), + overrode: None, + }, + ]); + + assert_eq!( + compressed, + vec![ + LineAttribution { + start_line: 1, + end_line: 5, + author_id: "ai-1".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 6, + end_line: 6, + 
author_id: "ai-1".to_string(), + overrode: Some("human".to_string()), + }, + LineAttribution { + start_line: 7, + end_line: 9, + author_id: "ai-2".to_string(), + overrode: None, + }, + ], + "compression should merge contiguous ranges only when author/override match" + ); + } + #[test] fn fast_path_rebase_note_remap_copies_logs_when_tracked_blobs_match() { let repo = TmpRepo::new().expect("tmp repo"); diff --git a/src/authorship/virtual_attribution.rs b/src/authorship/virtual_attribution.rs index 350cf7aff..5bc66f01d 100644 --- a/src/authorship/virtual_attribution.rs +++ b/src/authorship/virtual_attribution.rs @@ -26,12 +26,46 @@ pub struct VirtualAttributions { } impl VirtualAttributions { + pub fn empty(repo: Repository, base_commit: String) -> Self { + let ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis(); + + Self { + repo, + base_commit, + attributions: HashMap::new(), + file_contents: HashMap::new(), + prompts: BTreeMap::new(), + ts, + blame_start_commit: None, + } + } + /// Create a new VirtualAttributions for the given base commit with initial pathspecs pub async fn new_for_base_commit( repo: Repository, base_commit: String, pathspecs: &[String], blame_start_commit: Option, + ) -> Result { + Self::new_for_base_commit_with_options( + repo, + base_commit, + pathspecs, + blame_start_commit, + true, + ) + .await + } + + pub async fn new_for_base_commit_with_options( + repo: Repository, + base_commit: String, + pathspecs: &[String], + blame_start_commit: Option, + discover_foreign_prompts: bool, ) -> Result { let ts = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -53,8 +87,11 @@ impl VirtualAttributions { virtual_attrs.add_pathspecs_concurrent(pathspecs).await?; } - // After running blame, discover and load any missing prompts from blamed commits - virtual_attrs.discover_and_load_foreign_prompts().await?; + if discover_foreign_prompts { + // After running blame, discover and load any missing prompts from blamed commits. + // Some flows (like squash preparation) can skip this and rely on existing metadata. 
+            virtual_attrs.discover_and_load_foreign_prompts().await?;
+        }
 
         Ok(virtual_attrs)
     }
@@ -213,12 +250,15 @@ impl VirtualAttributions {
         let results = futures::future::join_all(tasks).await;
 
         // Process results and store in HashMap
+        let mut pending_line_attrs: Vec<(
+            String,
+            Vec<LineAttribution>,
+            HashMap<String, PromptRecord>,
+        )> = Vec::new();
         for result in results {
             match result {
-                Ok(Some((file_path, content, char_attrs, line_attrs))) => {
-                    self.attributions
-                        .insert(file_path.clone(), (char_attrs, line_attrs));
-                    self.file_contents.insert(file_path, content);
+                Ok(Some((file_path, line_attrs, prompt_records))) => {
+                    pending_line_attrs.push((file_path, line_attrs, prompt_records))
                 }
                 Ok(None) => {
                     // File had no changes or couldn't be processed, skip
@@ -227,6 +267,34 @@ impl VirtualAttributions {
             }
         }
 
+        if pending_line_attrs.is_empty() {
+            return Ok(());
+        }
+
+        let files_to_hydrate: Vec<String> = pending_line_attrs
+            .iter()
+            .map(|(file_path, _, _)| file_path.clone())
+            .collect();
+        let file_contents = self
+            .repo
+            .get_files_content_at_commit(&self.base_commit, &files_to_hydrate)?;
+
+        for (file_path, line_attrs, prompt_records) in pending_line_attrs {
+            let file_content = file_contents.get(&file_path).cloned().unwrap_or_default();
+            let char_attrs = line_attributions_to_attributions(&line_attrs, &file_content, self.ts);
+
+            self.attributions
+                .insert(file_path.clone(), (char_attrs, line_attrs));
+            self.file_contents.insert(file_path, file_content);
+
+            for (prompt_id, prompt_record) in prompt_records {
+                self.prompts
+                    .entry(prompt_id)
+                    .or_default()
+                    .insert(self.base_commit.clone(), prompt_record);
+            }
+        }
+
         Ok(())
     }
 
@@ -296,10 +364,46 @@ impl VirtualAttributions {
         repo: Repository,
         base_commit: String,
         human_author: Option<String>,
+    ) -> Result<Self, GitAiError> {
+        Self::from_just_working_log_internal(repo, base_commit, human_author, true, true)
+    }
+
+    /// Lightweight variant of `from_just_working_log` that preserves line-level attribution
+    /// and prompt metadata without eagerly hydrating file contents / char-level ranges.
+    /// This is enough for post-commit conversion and avoids large file reads on wide commits.
+    pub fn from_just_working_log_line_only(
+        repo: Repository,
+        base_commit: String,
+        human_author: Option<String>,
+    ) -> Result<Self, GitAiError> {
+        Self::from_just_working_log_internal(repo, base_commit, human_author, false, true)
+    }
+
+    /// Build VirtualAttributions from INITIAL data only (skip checkpoints).
+    /// Useful when squash pre-commit was intentionally skipped and only INITIAL
+    /// should participate in post-commit conversion.
+    pub fn from_initial_only_line_only(
+        repo: Repository,
+        base_commit: String,
+        human_author: Option<String>,
+    ) -> Result<Self, GitAiError> {
+        Self::from_just_working_log_internal(repo, base_commit, human_author, false, false)
+    }
+
+    fn from_just_working_log_internal(
+        repo: Repository,
+        base_commit: String,
+        human_author: Option<String>,
+        hydrate_file_contents: bool,
+        include_checkpoints: bool,
     ) -> Result<Self, GitAiError> {
         let working_log = repo.storage.working_log_for_base_commit(&base_commit);
         let initial_attributions = working_log.read_initial_attributions();
-        let checkpoints = working_log.read_all_checkpoints().unwrap_or_default();
+        let checkpoints = if include_checkpoints {
+            working_log.read_all_checkpoints().unwrap_or_default()
+        } else {
+            Vec::new()
+        };
 
         let mut attributions: HashMap<String, (Vec<Attribution>, Vec<LineAttribution>)> =
             HashMap::new();
@@ -321,19 +425,24 @@ impl VirtualAttributions {
 
         // Process INITIAL attributions
         for (file_path, line_attrs) in &initial_attributions.files {
-            // Get the latest file content from working directory
-            if let Ok(workdir) = repo.workdir() {
-                let abs_path = workdir.join(file_path);
-                let file_content = if abs_path.exists() {
-                    std::fs::read_to_string(&abs_path).unwrap_or_default()
-                } else {
-                    String::new()
-                };
-                file_contents.insert(file_path.clone(), file_content.clone());
+            if hydrate_file_contents {
+                // Get the latest file content from working directory
+                if let Ok(workdir) = repo.workdir() {
+                    let abs_path = workdir.join(file_path);
+                    let file_content = if abs_path.exists() {
+                        std::fs::read_to_string(&abs_path).unwrap_or_default()
+                    } else {
+                        String::new()
+                    };
+                    file_contents.insert(file_path.clone(), file_content.clone());
 
-                // Convert line attributions to character attributions
-                let char_attrs = line_attributions_to_attributions(line_attrs, &file_content, 0);
-                attributions.insert(file_path.clone(), (char_attrs, line_attrs.clone()));
+                    // Convert line attributions to character attributions
+                    let char_attrs =
+                        line_attributions_to_attributions(line_attrs, &file_content, 0);
+                    attributions.insert(file_path.clone(), (char_attrs, line_attrs.clone()));
+                }
+            } else {
+                attributions.insert(file_path.clone(), (Vec::new(), line_attrs.clone()));
             }
         }
 
@@ -384,8 +493,10 @@ impl VirtualAttributions {
                 continue;
             }
 
-            // Get the latest file content from working directory
-            if let Ok(workdir) = repo.workdir() {
+            // Get the latest file content from working directory when needed.
+            if (hydrate_file_contents || entry.line_attributions.is_empty())
+                && let Ok(workdir) = repo.workdir()
+            {
                 let abs_path = workdir.join(&entry.file);
                 let file_content = if abs_path.exists() {
                     std::fs::read_to_string(&abs_path).unwrap_or_default()
@@ -411,7 +522,11 @@ impl VirtualAttributions {
                 continue;
            }

-            let char_attrs = line_attributions_to_attributions(&line_attrs, &file_content, 0);
+            let char_attrs = if hydrate_file_contents {
+                line_attributions_to_attributions(&line_attrs, &file_content, 0)
+            } else {
+                Vec::new()
+            };
 
             attributions.insert(entry.file.clone(), (char_attrs, line_attrs));
         }
@@ -1051,82 +1166,56 @@ impl VirtualAttributions {
                 None => continue, // No committed hunks for this file, skip
            };

-            // Map author_id -> line numbers (in commit coordinates)
-            let mut committed_lines_map: StdHashMap<String, Vec<u32>> = StdHashMap::new();
+            // Map author_id -> committed ranges (in commit coordinates).
+            // This avoids expanding ranges to individual lines for large commits.
+ let mut committed_ranges_map: StdHashMap> = StdHashMap::new(); for line_attr in line_attrs { - // Since we're not dealing with unstaged hunks, the line numbers in VirtualAttributions - // are already in the right coordinates (working log coordinates = commit coordinates) - for line_num in line_attr.start_line..=line_attr.end_line { - // Check if this line is in any committed hunk - let is_committed = file_committed_hunks - .iter() - .any(|hunk| hunk.contains(line_num)); - - if is_committed { - committed_lines_map + // Skip human attributions - we only track AI attributions in the output + if line_attr.author_id == CheckpointKind::Human.to_str() { + continue; + } + + let attr_start = line_attr.start_line; + let attr_end = line_attr.end_line; + + for hunk in file_committed_hunks { + let (hunk_start, hunk_end) = line_range_to_bounds(hunk); + let intersection_start = attr_start.max(hunk_start); + let intersection_end = attr_end.min(hunk_end); + + if intersection_start <= intersection_end { + committed_ranges_map .entry(line_attr.author_id.clone()) .or_default() - .push(line_num); + .push((intersection_start, intersection_end)); } } } // Add committed attributions to authorship log - if !committed_lines_map.is_empty() { - // Create attestation entries from committed lines - for (author_id, mut lines) in committed_lines_map { - // Skip human attributions - we only track AI attributions in the output - if author_id == CheckpointKind::Human.to_str() { - continue; - } - - lines.sort(); - lines.dedup(); - - if lines.is_empty() { + if !committed_ranges_map.is_empty() { + for (author_id, ranges) in committed_ranges_map { + let merged_ranges = merge_u32_ranges(ranges); + if merged_ranges.is_empty() { continue; } - // Create line ranges - let mut ranges = Vec::new(); - let mut range_start = lines[0]; - let mut range_end = lines[0]; - - for &line in &lines[1..] { - if line == range_end + 1 { - range_end = line; - } else { - if range_start == range_end { - ranges.push(crate::authorship::authorship_log::LineRange::Single( - range_start, - )); + let line_ranges = merged_ranges + .into_iter() + .map(|(start, end)| { + if start == end { + crate::authorship::authorship_log::LineRange::Single(start) } else { - ranges.push(crate::authorship::authorship_log::LineRange::Range( - range_start, - range_end, - )); + crate::authorship::authorship_log::LineRange::Range(start, end) } - range_start = line; - range_end = line; - } - } - - // Add the last range - if range_start == range_end { - ranges.push(crate::authorship::authorship_log::LineRange::Single( - range_start, - )); - } else { - ranges.push(crate::authorship::authorship_log::LineRange::Range( - range_start, - range_end, - )); - } + }) + .collect(); let entry = crate::authorship::authorship_log_serialization::AttestationEntry::new( - author_id, ranges, + author_id, + line_ranges, ); let file_attestation = authorship_log.get_or_create_file(file_path); @@ -1313,6 +1402,15 @@ impl VirtualAttributions { ); } } + + /// Drop inline prompt messages while preserving prompt IDs and metrics. 
+ pub fn clear_prompt_messages(&mut self) { + for commits in self.prompts.values_mut() { + for prompt_record in commits.values_mut() { + prompt_record.messages.clear(); + } + } + } } /// Merge two VirtualAttributions, favoring the primary for overlaps pub fn merge_attributions_favoring_first( @@ -1664,9 +1762,9 @@ fn compute_attributions_for_file( repo: &Repository, base_commit: &str, file_path: &str, - ts: u128, + _ts: u128, blame_start_commit: Option, -) -> Result, Vec)>, GitAiError> { +) -> Result, HashMap)>, GitAiError> { // Set up blame options let mut ai_blame_opts = GitAiBlameOptions::default(); #[allow(clippy::field_reassign_with_default)] @@ -1674,6 +1772,9 @@ fn compute_attributions_for_file( ai_blame_opts.no_output = true; ai_blame_opts.return_human_authors_as_human = true; ai_blame_opts.use_prompt_hashes_as_names = true; + // This path only needs line->prompt attribution, not per-hunk human-author metadata. + // Skipping this avoids an extra full authorship lookup pass in blame_hunks. + ai_blame_opts.populate_ai_human_authors = false; ai_blame_opts.newest_commit = Some(base_commit.to_string()); ai_blame_opts.oldest_commit = blame_start_commit; ai_blame_opts.oldest_date = Some(*OLDEST_AI_BLAME_DATE); @@ -1683,7 +1784,7 @@ fn compute_attributions_for_file( let ai_blame = repo.blame(file_path, &ai_blame_opts); match ai_blame { - Ok((blames, _)) => { + Ok((blames, prompt_records)) => { // Convert blame results to line attributions let mut line_attributions = Vec::new(); for (line, author) in blames { @@ -1699,19 +1800,10 @@ fn compute_attributions_for_file( }); } - // Get the file content at this commit to convert to character attributions - // We need to read the file content that blame operated on - let file_content = get_file_content_at_commit(repo, base_commit, file_path)?; - - // Convert line attributions to character attributions - let char_attributions = - line_attributions_to_attributions(&line_attributions, &file_content, ts); - Ok(Some(( file_path.to_string(), - file_content, - char_attributions, line_attributions, + prompt_records, ))) } Err(_) => { @@ -1721,25 +1813,31 @@ fn compute_attributions_for_file( } } -fn get_file_content_at_commit( - repo: &Repository, - commit_sha: &str, - file_path: &str, -) -> Result { - let commit = repo.find_commit(commit_sha.to_string())?; - let tree = commit.tree()?; +fn line_range_to_bounds(range: &LineRange) -> (u32, u32) { + match range { + LineRange::Single(line) => (*line, *line), + LineRange::Range(start, end) => (*start, *end), + } +} - match tree.get_path(std::path::Path::new(file_path)) { - Ok(entry) => { - if let Ok(blob) = repo.find_blob(entry.id()) { - let blob_content = blob.content().unwrap_or_default(); - Ok(String::from_utf8_lossy(&blob_content).to_string()) - } else { - Ok(String::new()) +fn merge_u32_ranges(mut ranges: Vec<(u32, u32)>) -> Vec<(u32, u32)> { + if ranges.is_empty() { + return ranges; + } + + ranges.sort_by_key(|(start, end)| (*start, *end)); + + let mut merged: Vec<(u32, u32)> = Vec::with_capacity(ranges.len()); + for (start, end) in ranges { + match merged.last_mut() { + Some((_, last_end)) if start <= last_end.saturating_add(1) => { + *last_end = (*last_end).max(end); } + _ => merged.push((start, end)), } - Err(_) => Ok(String::new()), } + + merged } /// Check if a file exists in a commit's tree @@ -1757,6 +1855,8 @@ fn file_exists_in_commit( mod tests { use super::*; + use crate::authorship::authorship_log::PromptRecord; + use crate::authorship::working_log::AgentId; use crate::git::test_utils::TmpRepo; 
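+    // Minimal sketch of the merge behavior the committed-range path relies on:
+    // overlapping and adjacent ranges coalesce, ranges separated by a gap stay distinct.
+    #[test]
+    fn test_merge_u32_ranges_coalesces_overlapping_and_adjacent_ranges() {
+        assert_eq!(merge_u32_ranges(vec![(5, 7), (1, 3), (4, 4)]), vec![(1, 7)]);
+        assert_eq!(
+            merge_u32_ranges(vec![(10, 12), (14, 15)]),
+            vec![(10, 12), (14, 15)]
+        );
+        assert!(merge_u32_ranges(Vec::new()).is_empty());
+    }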
#[test] @@ -1811,4 +1911,309 @@ mod tests { assert!(!virtual_attributions.files().is_empty()); } + + #[test] + fn test_line_only_working_log_loader_matches_full_loader_output() { + let repo = TmpRepo::new().unwrap(); + + // Seed base commit. + repo.write_file("sample.txt", "base\n", true).unwrap(); + repo.trigger_checkpoint_with_ai("seed_ai", None, None) + .unwrap(); + repo.commit_with_message("seed commit").unwrap(); + let base_commit = repo.head_commit_sha().unwrap(); + + // Create pending AI-attributed changes on top of base. + repo.write_file("sample.txt", "base\nai line 1\nai line 2\n", true) + .unwrap(); + repo.trigger_checkpoint_with_ai("delta_ai", None, None) + .unwrap(); + + let full_va = VirtualAttributions::from_just_working_log( + repo.gitai_repo().clone(), + base_commit.clone(), + Some("Test User ".to_string()), + ) + .unwrap(); + let line_only_va = VirtualAttributions::from_just_working_log_line_only( + repo.gitai_repo().clone(), + base_commit.clone(), + Some("Test User ".to_string()), + ) + .unwrap(); + + let mut full_files = full_va.files(); + let mut line_only_files = line_only_va.files(); + full_files.sort(); + line_only_files.sort(); + assert_eq!(full_files, line_only_files); + assert_eq!(full_va.prompts(), line_only_va.prompts()); + + for file in &full_files { + let (full_char, full_line) = full_va.get_attributions(file).unwrap(); + let (line_only_char, line_only_line) = line_only_va.get_attributions(file).unwrap(); + + assert!( + !full_char.is_empty(), + "full loader should hydrate char ranges" + ); + assert!( + line_only_char.is_empty(), + "line-only loader should skip char range hydration" + ); + assert_eq!( + full_line, line_only_line, + "line-level attributions must match between loaders" + ); + } + + let (full_log, full_initial) = full_va + .to_authorship_log_and_initial_working_log( + repo.gitai_repo(), + &base_commit, + &base_commit, + None, + ) + .unwrap(); + let (line_only_log, line_only_initial) = line_only_va + .to_authorship_log_and_initial_working_log( + repo.gitai_repo(), + &base_commit, + &base_commit, + None, + ) + .unwrap(); + + assert_eq!(full_initial.files, line_only_initial.files); + assert_eq!(full_initial.prompts, line_only_initial.prompts); + assert_eq!( + full_log.serialize_to_string().unwrap(), + line_only_log.serialize_to_string().unwrap(), + "Authorship output must be identical between full and line-only loaders" + ); + } + + #[test] + fn test_initial_only_line_loader_ignores_checkpoints_and_keeps_initial_prompts() { + let repo = TmpRepo::new().unwrap(); + + repo.write_file("base.txt", "base\n", true).unwrap(); + repo.trigger_checkpoint_with_author("seed").unwrap(); + repo.commit_with_message("seed commit").unwrap(); + let base_commit = repo.head_commit_sha().unwrap(); + + // Create AI checkpoint data that should be ignored by from_initial_only_line_only. + repo.write_file("checkpoint_only.txt", "cp\nai cp line\n", true) + .unwrap(); + repo.trigger_checkpoint_with_ai("checkpoint_ai", None, None) + .unwrap(); + + // Write INITIAL attribution + prompt that must be preserved. 
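+        // The prompt is keyed by the same author_id ("initial_ai") as the INITIAL line
+        // attribution, so the loader can surface it through va.prompts().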
+ let working_log = repo + .gitai_repo() + .storage + .working_log_for_base_commit(&base_commit); + + let mut initial_files = HashMap::new(); + initial_files.insert( + "initial_only.txt".to_string(), + vec![LineAttribution { + start_line: 1, + end_line: 1, + author_id: "initial_ai".to_string(), + overrode: None, + }], + ); + + let mut initial_prompts = HashMap::new(); + initial_prompts.insert( + "initial_ai".to_string(), + PromptRecord { + agent_id: AgentId { + tool: "mock_tool".to_string(), + id: "initial-session".to_string(), + model: "mock-model".to_string(), + }, + human_author: Some("Test User ".to_string()), + messages: vec![], + total_additions: 5, + total_deletions: 2, + accepted_lines: 0, + overriden_lines: 0, + messages_url: None, + }, + ); + working_log + .write_initial_attributions(initial_files, initial_prompts) + .unwrap(); + + let va = VirtualAttributions::from_initial_only_line_only( + repo.gitai_repo().clone(), + base_commit.clone(), + Some("Test User ".to_string()), + ) + .unwrap(); + + let mut files = va.files(); + files.sort(); + assert_eq!( + files, + vec!["initial_only.txt".to_string()], + "initial-only loader must ignore checkpoint files" + ); + assert!( + va.get_line_attributions("initial_only.txt") + .is_some_and(|attrs| !attrs.is_empty()), + "initial attributions should be preserved" + ); + assert!( + va.prompts().contains_key("initial_ai"), + "initial prompts should be preserved" + ); + assert!( + !va.prompts().contains_key("checkpoint_ai"), + "checkpoint prompts should be ignored by initial-only loader" + ); + } + + #[test] + fn test_clear_prompt_messages_keeps_prompt_keys_and_metrics() { + let repo = TmpRepo::new().unwrap(); + let mut prompts = BTreeMap::new(); + let mut commits = BTreeMap::new(); + commits.insert( + "base".to_string(), + PromptRecord { + agent_id: AgentId { + tool: "mock_tool".to_string(), + id: "session".to_string(), + model: "mock-model".to_string(), + }, + human_author: Some("Test User ".to_string()), + messages: vec![ + crate::authorship::transcript::Message::user("hello".to_string(), None), + crate::authorship::transcript::Message::assistant("world".to_string(), None), + ], + total_additions: 10, + total_deletions: 3, + accepted_lines: 7, + overriden_lines: 1, + messages_url: None, + }, + ); + prompts.insert("mock_prompt".to_string(), commits); + + let mut va = VirtualAttributions::new_with_prompts( + repo.gitai_repo().clone(), + "base".to_string(), + HashMap::new(), + HashMap::new(), + prompts, + 0, + ); + + va.clear_prompt_messages(); + + let prompt = va + .prompts() + .get("mock_prompt") + .and_then(|commit_map| commit_map.get("base")) + .expect("prompt should still exist"); + assert!( + prompt.messages.is_empty(), + "messages should be cleared in-place" + ); + assert_eq!(prompt.total_additions, 10); + assert_eq!(prompt.total_deletions, 3); + assert_eq!(prompt.accepted_lines, 7); + assert_eq!(prompt.overriden_lines, 1); + } + + #[test] + fn test_index_only_matches_added_lines_for_full_file_ai_attr() { + let repo = TmpRepo::new().unwrap(); + + repo.write_file( + "range.txt", + "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\n", + true, + ) + .unwrap(); + repo.trigger_checkpoint_with_author("seed").unwrap(); + repo.commit_with_message("seed").unwrap(); + let parent_sha = repo.head_commit_sha().unwrap(); + + let updated_content = "\ +line 1 +line 2 edited +line 3 +line 4 +line 4.5 inserted +line 5 +line 6 +line 7 inserted +"; + repo.write_file("range.txt", updated_content, true).unwrap(); + 
repo.trigger_checkpoint_with_author("editor").unwrap(); + repo.commit_with_message("update").unwrap(); + let commit_sha = repo.head_commit_sha().unwrap(); + + let total_lines = updated_content.lines().count() as u32; + let mut attributions = HashMap::new(); + attributions.insert( + "range.txt".to_string(), + ( + Vec::new(), + vec![LineAttribution { + start_line: 1, + end_line: total_lines, + author_id: "mock_ai".to_string(), + overrode: None, + }], + ), + ); + + let va = VirtualAttributions::new( + repo.gitai_repo().clone(), + commit_sha.clone(), + attributions, + HashMap::new(), + 0, + ); + + let authorship_log = va + .to_authorship_log_index_only(repo.gitai_repo(), &parent_sha, &commit_sha, None) + .unwrap(); + + let expected_added_lines = repo + .gitai_repo() + .diff_added_lines(&parent_sha, &commit_sha, None) + .unwrap() + .get("range.txt") + .cloned() + .unwrap_or_default(); + + let mut actual_added_lines: Vec = authorship_log + .attestations + .iter() + .find(|entry| entry.file_path == "range.txt") + .map(|file_attestation| { + file_attestation + .entries + .iter() + .flat_map(|entry| entry.line_ranges.iter()) + .flat_map(|range| match range { + crate::authorship::authorship_log::LineRange::Single(line) => vec![*line], + crate::authorship::authorship_log::LineRange::Range(start, end) => { + (*start..=*end).collect() + } + }) + .collect() + }) + .unwrap_or_default(); + actual_added_lines.sort_unstable(); + actual_added_lines.dedup(); + + assert_eq!(actual_added_lines, expected_added_lines); + } } diff --git a/src/commands/blame.rs b/src/commands/blame.rs index 52cf04392..f377865af 100644 --- a/src/commands/blame.rs +++ b/src/commands/blame.rs @@ -1,8 +1,8 @@ use crate::authorship::authorship_log::PromptRecord; -use crate::authorship::authorship_log_serialization::AuthorshipLog; +use crate::authorship::authorship_log_serialization::{AUTHORSHIP_LOG_VERSION, AuthorshipLog}; use crate::authorship::working_log::CheckpointKind; use crate::error::GitAiError; -use crate::git::refs::get_reference_as_authorship_log_v3; +use crate::git::refs::{get_reference_as_authorship_log_v3, note_blob_oids_for_commits}; use crate::git::repository::Repository; use crate::git::repository::{exec_git, exec_git_stdin}; #[cfg(windows)] @@ -141,6 +141,9 @@ pub struct GitAiBlameOptions { // When true, a single git blame hunk may be split into multiple hunks // if different lines were authored by different humans working with AI pub split_hunks_by_ai_author: bool, + // Populate per-hunk `ai_human_author` metadata in `blame_hunks`. + // This is only needed for human-facing blame outputs. + pub populate_ai_human_authors: bool, } impl Default for GitAiBlameOptions { @@ -185,6 +188,7 @@ impl Default for GitAiBlameOptions { json: false, mark_unknown: false, split_hunks_by_ai_author: true, + populate_ai_human_authors: true, } } } @@ -269,83 +273,92 @@ impl Repository { options.clone() }; - // Read file content from one of: - // 1. Provided contents_data (from --contents flag) - // 2. A specific commit - // 3. 
The working directory - let (file_content, total_lines) = if let Some(ref data) = options.contents_data { - // Use pre-read contents data (from --contents stdin or file) - let content = String::from_utf8_lossy(data).to_string(); - let lines_count = content.lines().count() as u32; - (content, lines_count) - } else if let Some(ref commit) = options.newest_commit { - // Read file content from the specified commit - // This ensures blame is independent of which branch is checked out - let commit_obj = self.find_commit(commit.clone())?; - let tree = commit_obj.tree()?; - - match tree.get_path(std::path::Path::new(&relative_file_path)) { - Ok(entry) => { - if let Ok(blob) = self.find_blob(entry.id()) { - let blob_content = blob.content().unwrap_or_default(); - let content = String::from_utf8_lossy(&blob_content).to_string(); - let lines_count = content.lines().count() as u32; - (content, lines_count) - } else { + let fast_full_file_no_output = options.no_output && options.line_ranges.is_empty(); + let mut file_content = String::new(); + let mut line_ranges: Vec<(u32, u32)> = Vec::new(); + + // Step 1: Get Git's native blame hunks. + // Fast path for machine-only callers: skip file content hydration and range validation. + let all_blame_hunks = if fast_full_file_no_output { + self.blame_hunks_full(&relative_file_path, &options)? + } else { + // Read file content from one of: + // 1. Provided contents_data (from --contents flag) + // 2. A specific commit + // 3. The working directory + let total_lines = if let Some(ref data) = options.contents_data { + // Use pre-read contents data (from --contents stdin or file) + file_content = String::from_utf8_lossy(data).to_string(); + file_content.lines().count() as u32 + } else if let Some(ref commit) = options.newest_commit { + // Read file content from the specified commit + // This ensures blame is independent of which branch is checked out + let commit_obj = self.find_commit(commit.clone())?; + let tree = commit_obj.tree()?; + + match tree.get_path(std::path::Path::new(&relative_file_path)) { + Ok(entry) => { + if let Ok(blob) = self.find_blob(entry.id()) { + let blob_content = blob.content().unwrap_or_default(); + file_content = String::from_utf8_lossy(&blob_content).to_string(); + file_content.lines().count() as u32 + } else { + return Err(GitAiError::Generic(format!( + "File '{}' is not a blob in commit {}", + relative_file_path, commit + ))); + } + } + Err(_) => { return Err(GitAiError::Generic(format!( - "File '{}' is not a blob in commit {}", + "File '{}' not found in commit {}", relative_file_path, commit ))); } } - Err(_) => { + } else { + // Read from working directory (existing behavior) + let abs_file_path = repo_root.join(&relative_file_path); + + if !abs_file_path.exists() { return Err(GitAiError::Generic(format!( - "File '{}' not found in commit {}", - relative_file_path, commit + "File not found: {}", + abs_file_path.display() ))); } - } - } else { - // Read from working directory (existing behavior) - let abs_file_path = repo_root.join(&relative_file_path); - - if !abs_file_path.exists() { - return Err(GitAiError::Generic(format!( - "File not found: {}", - abs_file_path.display() - ))); - } - - let content = fs::read_to_string(&abs_file_path)?; - let lines_count = content.lines().count() as u32; - (content, lines_count) - }; - let lines: Vec<&str> = file_content.lines().collect(); + file_content = fs::read_to_string(&abs_file_path)?; + file_content.lines().count() as u32 + }; - // Determine the line ranges to process - let line_ranges = 
if options.line_ranges.is_empty() { - vec![(1, total_lines)] - } else { - options.line_ranges.clone() - }; + // Determine the line ranges to process + line_ranges = if options.line_ranges.is_empty() { + vec![(1, total_lines)] + } else { + options.line_ranges.clone() + }; - // Validate line ranges - for (start, end) in &line_ranges { - if *start == 0 || *end == 0 || start > end || *end > total_lines { - return Err(GitAiError::Generic(format!( - "Invalid line range: {}:{}. File has {} lines", - start, end, total_lines - ))); + // Validate line ranges + for (start, end) in &line_ranges { + if *start == 0 || *end == 0 || start > end || *end > total_lines { + return Err(GitAiError::Generic(format!( + "Invalid line range: {}:{}. File has {} lines", + start, end, total_lines + ))); + } } - } - // Step 1: Get Git's native blame for all ranges - let mut all_blame_hunks = Vec::new(); - for (start_line, end_line) in &line_ranges { - let hunks = self.blame_hunks(&relative_file_path, *start_line, *end_line, &options)?; - all_blame_hunks.extend(hunks); - } + let mut hunks = Vec::new(); + for (start_line, end_line) in &line_ranges { + hunks.extend(self.blame_hunks( + &relative_file_path, + *start_line, + *end_line, + &options, + )?); + } + hunks + }; // Step 2: Overlay AI authorship information let (line_authors, prompt_records, authorship_logs, prompt_commits) = @@ -355,6 +368,8 @@ impl Repository { return Ok((line_authors, prompt_records)); } + let lines: Vec<&str> = file_content.lines().collect(); + // Output based on format if options.json { output_json_format( @@ -402,6 +417,23 @@ impl Repository { start_line: u32, end_line: u32, options: &GitAiBlameOptions, + ) -> Result, GitAiError> { + self.blame_hunks_internal(file_path, Some((start_line, end_line)), options) + } + + fn blame_hunks_full( + &self, + file_path: &str, + options: &GitAiBlameOptions, + ) -> Result, GitAiError> { + self.blame_hunks_internal(file_path, None, options) + } + + fn blame_hunks_internal( + &self, + file_path: &str, + line_range: Option<(u32, u32)>, + options: &GitAiBlameOptions, ) -> Result, GitAiError> { // Build git blame --line-porcelain command let mut args = self.global_args_for_exec(); @@ -423,9 +455,11 @@ impl Repository { args.push(file.clone()); } - // Limit to specified range - args.push("-L".to_string()); - args.push(format!("{},{}", start_line, end_line)); + // Limit to specified range (or blame entire file when no range is provided) + if let Some((start_line, end_line)) = line_range { + args.push("-L".to_string()); + args.push(format!("{},{}", start_line, end_line)); + } // Add --since flag if oldest_date is specified // This controls the absolute lower bound of how far back to look @@ -684,10 +718,12 @@ impl Repository { }); } - // Post-process hunks to populate ai_human_author from authorship logs - let hunks = self.populate_ai_human_authors(hunks, file_path, options)?; - - Ok(hunks) + if options.populate_ai_human_authors { + // Post-process hunks to populate ai_human_author from authorship logs + self.populate_ai_human_authors(hunks, file_path, options) + } else { + Ok(hunks) + } } /// Post-process blame hunks to populate ai_human_author from authorship logs. @@ -815,8 +851,17 @@ fn overlay_ai_authorship( // Track which commits contain each prompt hash let mut prompt_commits: HashMap> = HashMap::new(); - // Group hunks by commit SHA to avoid repeated lookups - let mut commit_authorship_cache: HashMap> = HashMap::new(); + // Group hunks by commit SHA to avoid repeated lookups. 
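+    // Every hunk needs the authorship note of its commit, so the unique commit SHAs
+    // are collected first and resolved in one pass.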
+ // Preload in batch to avoid N git notes lookups per file. + let mut unique_commit_shas: Vec = blame_hunks + .iter() + .map(|h| h.commit_sha.clone()) + .collect::>() + .into_iter() + .collect(); + unique_commit_shas.sort(); + let mut commit_authorship_cache: HashMap> = + preload_authorship_logs_for_commits(repo, &unique_commit_shas).unwrap_or_default(); // Cache for foreign prompts to avoid repeated grepping let mut foreign_prompts_cache: HashMap> = HashMap::new(); @@ -921,6 +966,126 @@ fn overlay_ai_authorship( )) } +fn preload_authorship_logs_for_commits( + repo: &Repository, + commit_shas: &[String], +) -> Result>, GitAiError> { + let mut out: HashMap> = HashMap::new(); + if commit_shas.is_empty() { + return Ok(out); + } + + let note_blob_map = note_blob_oids_for_commits(repo, commit_shas)?; + if note_blob_map.is_empty() { + for sha in commit_shas { + out.insert(sha.clone(), None); + } + return Ok(out); + } + + let mut unique_blob_oids: Vec = note_blob_map + .values() + .cloned() + .collect::>() + .into_iter() + .collect(); + unique_blob_oids.sort(); + + let blob_contents = batch_read_blobs_with_oids(repo, &unique_blob_oids)?; + + for commit_sha in commit_shas { + let authorship_log = note_blob_map + .get(commit_sha) + .and_then(|blob_oid| blob_contents.get(blob_oid)) + .and_then(|content| { + AuthorshipLog::deserialize_from_string(content) + .ok() + .and_then(|mut log| { + if log.metadata.schema_version != AUTHORSHIP_LOG_VERSION { + return None; + } + log.metadata.base_commit_sha = commit_sha.clone(); + Some(log) + }) + }); + out.insert(commit_sha.clone(), authorship_log); + } + + Ok(out) +} + +fn batch_read_blobs_with_oids( + repo: &Repository, + blob_oids: &[String], +) -> Result, GitAiError> { + if blob_oids.is_empty() { + return Ok(HashMap::new()); + } + + let mut args = repo.global_args_for_exec(); + args.push("cat-file".to_string()); + args.push("--batch".to_string()); + + let stdin_data = blob_oids.join("\n") + "\n"; + let output = exec_git_stdin(&args, stdin_data.as_bytes())?; + + parse_cat_file_batch_output_with_oids(&output.stdout) +} + +fn parse_cat_file_batch_output_with_oids( + data: &[u8], +) -> Result, GitAiError> { + let mut results = HashMap::new(); + let mut pos = 0usize; + + while pos < data.len() { + let header_end = match data[pos..].iter().position(|&b| b == b'\n') { + Some(idx) => pos + idx, + None => break, + }; + + let header = std::str::from_utf8(&data[pos..header_end])?; + let parts: Vec<&str> = header.split_whitespace().collect(); + if parts.len() < 2 { + pos = header_end + 1; + continue; + } + + let oid = parts[0].to_string(); + if parts[1] == "missing" { + pos = header_end + 1; + continue; + } + + if parts.len() < 3 { + pos = header_end + 1; + continue; + } + + let size: usize = parts[2] + .parse() + .map_err(|e| GitAiError::Generic(format!("Invalid size in cat-file output: {}", e)))?; + + let content_start = header_end + 1; + let content_end = content_start + size; + if content_end > data.len() { + return Err(GitAiError::Generic( + "Malformed cat-file --batch output: truncated content".to_string(), + )); + } + + let content = String::from_utf8_lossy(&data[content_start..content_end]).to_string(); + results.insert(oid, content); + + pos = content_end; + if pos < data.len() && data[pos] == b'\n' { + pos += 1; + } + } + + Ok(results) +} + /// JSON output structure for blame #[derive(Debug, Serialize)] struct JsonBlameOutput { diff --git a/src/commands/checkpoint.rs b/src/commands/checkpoint.rs index 1c3f2b031..e92437078 100644 --- 
a/src/commands/checkpoint.rs +++ b/src/commands/checkpoint.rs @@ -161,6 +161,22 @@ pub fn run( debug_log("No AI edits,in pre-commit checkpoint, skipping"); return Ok((0, 0, 0)); } + + // Fast path for squash commits: merge --squash prep already materialized INITIAL + // attributions. If the staged tree is unchanged since prep, running checkpoint + // does not add information and only burns time. + if has_no_ai_edits + && has_initial_attributions + && is_squash_commit_in_progress(repo) + && !Config::get().get_feature_flags().inter_commit_move + && let (Some(prepared_tree), Ok(current_tree)) = + (working_log.read_squash_tree_oid(), repo.index_tree_oid()) + && prepared_tree == current_tree + { + let _ = working_log.mark_squash_precommit_skipped(); + debug_log("Squash commit with unchanged staged tree; skipping pre-commit checkpoint"); + return Ok((0, 0, 0)); + } } // Set dirty files if available @@ -520,6 +536,10 @@ pub fn run( Ok((entries.len(), files.len(), checkpoints.len())) } +fn is_squash_commit_in_progress(repo: &Repository) -> bool { + repo.path().join("SQUASH_MSG").exists() +} + // Gets tracked changes AND fn get_status_of_files( repo: &Repository, @@ -1893,6 +1913,124 @@ mod tests { ); } + #[test] + fn test_pre_commit_squash_fast_path_skips_when_staged_tree_matches_marker() { + let (tmp_repo, mut file, _) = TmpRepo::new_with_base_commit().unwrap(); + file.append("squash staged line\n").unwrap(); + + let repo = tmp_repo.gitai_repo(); + let base_commit = repo + .head() + .ok() + .and_then(|head| head.target().ok()) + .unwrap_or_else(|| "initial".to_string()); + let working_log = repo.storage.working_log_for_base_commit(&base_commit); + + let mut initial_files = HashMap::new(); + initial_files.insert( + file.filename().to_string(), + vec![LineAttribution { + start_line: 2, + end_line: 2, + author_id: "mock_ai".to_string(), + overrode: None, + }], + ); + working_log + .write_initial_attributions(initial_files, HashMap::new()) + .unwrap(); + + let tree_oid = repo.index_tree_oid().unwrap(); + working_log.write_squash_tree_oid(&tree_oid).unwrap(); + std::fs::write(repo.path().join("SQUASH_MSG"), "squash in progress").unwrap(); + + let (entries_len, files_len, checkpoints_len) = run( + repo, + "test_user", + CheckpointKind::Human, + false, + false, + true, + None, + true, + ) + .unwrap(); + + assert_eq!(entries_len, 0, "fast path should skip checkpoint entries"); + assert_eq!(files_len, 0, "fast path should skip file processing"); + assert_eq!(checkpoints_len, 0, "fast path should not append checkpoints"); + assert!( + working_log.was_squash_precommit_skipped(), + "fast path should mark squash pre-commit as skipped" + ); + assert!( + working_log.read_all_checkpoints().unwrap().is_empty(), + "no checkpoints should be written on squash fast-path skip" + ); + } + + #[test] + fn test_pre_commit_squash_fast_path_does_not_skip_when_tree_marker_mismatches() { + let (tmp_repo, mut file, _) = TmpRepo::new_with_base_commit().unwrap(); + file.append("squash staged line\n").unwrap(); + + let repo = tmp_repo.gitai_repo(); + let base_commit = repo + .head() + .ok() + .and_then(|head| head.target().ok()) + .unwrap_or_else(|| "initial".to_string()); + let working_log = repo.storage.working_log_for_base_commit(&base_commit); + + let mut initial_files = HashMap::new(); + initial_files.insert( + file.filename().to_string(), + vec![LineAttribution { + start_line: 2, + end_line: 2, + author_id: "mock_ai".to_string(), + overrode: None, + }], + ); + working_log + .write_initial_attributions(initial_files, 
HashMap::new()) + .unwrap(); + + working_log + .write_squash_tree_oid("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef") + .unwrap(); + std::fs::write(repo.path().join("SQUASH_MSG"), "squash in progress").unwrap(); + + let (entries_len, files_len, checkpoints_len) = run( + repo, + "test_user", + CheckpointKind::Human, + false, + false, + true, + None, + true, + ) + .unwrap(); + + assert!( + files_len > 0, + "tree mismatch should take normal pre-commit path and process files" + ); + assert!( + entries_len > 0, + "tree mismatch should produce checkpoint entries" + ); + assert!( + checkpoints_len > 0, + "tree mismatch should append a new checkpoint" + ); + assert!( + !working_log.was_squash_precommit_skipped(), + "tree mismatch must not mark squash pre-commit as skipped" + ); + } + #[test] fn test_compute_line_stats_ignores_whitespace_only_lines() { let (tmp_repo, _lines_file, _alphabet_file) = TmpRepo::new_with_base_commit().unwrap(); diff --git a/src/commands/hooks/commit_hooks.rs b/src/commands/hooks/commit_hooks.rs index 0fc650c5c..4b2759060 100644 --- a/src/commands/hooks/commit_hooks.rs +++ b/src/commands/hooks/commit_hooks.rs @@ -4,6 +4,7 @@ use crate::git::cli_parser::{ParsedGitInvocation, is_dry_run}; use crate::git::repository::Repository; use crate::git::rewrite_log::RewriteLogEvent; use crate::utils::debug_log; +use std::time::Instant; pub fn commit_pre_command_hook( parsed_args: &ParsedGitInvocation, @@ -40,6 +41,7 @@ pub fn commit_post_command_hook( repository: &mut Repository, command_hooks_context: &mut CommandHooksContext, ) { + let hook_start = Instant::now(); if is_dry_run(&parsed_args.command_args) { return; } @@ -69,6 +71,12 @@ pub fn commit_post_command_hook( } let commit_author = get_commit_default_author(repository, &parsed_args.command_args); + debug_log(&format!( + "[BENCHMARK] commit post-hook: setup before rewrite event took {:?}", + hook_start.elapsed() + )); + + let rewrite_start = Instant::now(); if parsed_args.has_command_flag("--amend") { if let (Some(orig), Some(sha)) = (original_commit.clone(), new_sha.clone()) { repository.handle_rewrite_log_event( @@ -93,9 +101,22 @@ pub fn commit_post_command_hook( true, ); } + debug_log(&format!( + "[BENCHMARK] commit post-hook: handle_rewrite_log_event took {:?}", + rewrite_start.elapsed() + )); // Flush logs and metrics after commit + let flush_start = Instant::now(); crate::observability::spawn_background_flush(); + debug_log(&format!( + "[BENCHMARK] commit post-hook: spawn_background_flush took {:?}", + flush_start.elapsed() + )); + debug_log(&format!( + "[BENCHMARK] commit post-hook: total hook duration {:?}", + hook_start.elapsed() + )); } pub fn get_commit_default_author(repo: &Repository, args: &[String]) -> String { diff --git a/src/git/authorship_traversal.rs b/src/git/authorship_traversal.rs index 0856f5a3f..dc1ce1de4 100644 --- a/src/git/authorship_traversal.rs +++ b/src/git/authorship_traversal.rs @@ -1,6 +1,5 @@ use std::collections::HashSet; -use crate::authorship::authorship_log_serialization::AuthorshipLog; use crate::error::GitAiError; use crate::git::refs::{commits_with_authorship_notes, note_blob_oids_for_commits}; #[cfg(test)] @@ -164,19 +163,24 @@ fn parse_cat_file_batch_output_with_oids( /// Extract file paths from a note blob content fn extract_file_paths_from_note(content: &str, files: &mut HashSet) { - // Find the divider and slice before it, then add minimal metadata to make it parseable - if let Some(divider_pos) = content.find("\n---\n") { - let attestation_section = &content[..divider_pos]; - // 
Create a complete parseable format with empty metadata - let parseable = format!( - "{}\n---\n{{\"schema_version\":\"authorship/3.0.0\",\"base_commit_sha\":\"\",\"prompts\":{{}}}}", - attestation_section - ); + // Only parse the attestation header section (before divider). + // File paths are unindented lines; attestation entries are indented by two spaces. + for line in content.lines() { + if line == "---" { + break; + } + if line.is_empty() || line.starts_with(" ") { + continue; + } - if let Ok(log) = AuthorshipLog::deserialize_from_string(&parseable) { - for attestation in log.attestations { - files.insert(attestation.file_path); - } + let file_path = if line.starts_with('"') && line.ends_with('"') && line.len() >= 2 { + line[1..line.len() - 1].to_string() + } else { + line.to_string() + }; + + if !file_path.is_empty() { + files.insert(file_path); } } } @@ -185,8 +189,59 @@ fn extract_file_paths_from_note(content: &str, files: &mut HashSet) { mod tests { use super::*; use crate::git::{find_repository_in_path, sync_authorship::fetch_authorship_notes}; + use std::collections::HashSet; use std::time::Instant; + #[test] + fn test_extract_file_paths_from_note_handles_quotes_and_ignores_entries() { + let note = r#"src/main.rs +"path with spaces.txt" + [1, 3] 82a0dd96f0f8d051 + [8, 9] human +--- +{"schema_version":"authorship/3.0.0","base_commit_sha":"abc","prompts":{}} +"#; + + let mut files = HashSet::new(); + extract_file_paths_from_note(note, &mut files); + + assert!(files.contains("src/main.rs")); + assert!(files.contains("path with spaces.txt")); + assert_eq!( + files.len(), + 2, + "indented attestation lines and metadata should be ignored" + ); + } + + #[test] + fn test_parse_cat_file_batch_output_with_oids_handles_content_and_missing() { + let data = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa blob 6\nx\ny\nz\nbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb missing\n"; + let parsed = parse_cat_file_batch_output_with_oids(data).expect("parse batch output"); + + assert_eq!( + parsed + .get("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + .map(String::as_str), + Some("x\ny\nz\n") + ); + assert!( + !parsed.contains_key("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"), + "missing objects should be skipped" + ); + } + + #[test] + fn test_parse_cat_file_batch_output_with_oids_errors_on_truncated_content() { + let truncated = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa blob 8\nabc"; + let err = parse_cat_file_batch_output_with_oids(truncated).expect_err("should fail"); + assert!( + err.to_string().contains("truncated"), + "unexpected error: {}", + err + ); + } + #[test] fn test_load_ai_touched_files_for_specific_commits() { smol::block_on(async { diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs index a24b1eea1..b2b169086 100644 --- a/src/git/repo_storage.rs +++ b/src/git/repo_storage.rs @@ -135,6 +135,12 @@ impl RepoStorage { self.read_rewrite_events() } + /// Append a rewrite event to the rewrite log file without reading/parsing the full log. + /// Hot command hooks should use this to avoid O(log-size) overhead on every command. 
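+    /// Unlike append_rewrite_event, the accumulated log is not read back or returned.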
+ pub fn append_rewrite_event_fast(&self, event: RewriteLogEvent) -> Result<(), GitAiError> { + append_event_to_file(&self.rewrite_log, event) + } + /// Read all rewrite events from the rewrite log file pub fn read_rewrite_events(&self) -> Result, GitAiError> { if !self.rewrite_log.exists() { @@ -158,6 +164,8 @@ pub struct PersistedWorkingLog { pub canonical_workdir: PathBuf, pub dirty_files: Option>, pub initial_file: PathBuf, + pub squash_tree_file: PathBuf, + pub squash_skip_checkpoint_file: PathBuf, } impl PersistedWorkingLog { @@ -169,6 +177,8 @@ impl PersistedWorkingLog { dirty_files: Option>, ) -> Self { let initial_file = dir.join("INITIAL"); + let squash_tree_file = dir.join("SQUASH_TREE"); + let squash_skip_checkpoint_file = dir.join("SQUASH_SKIP_CHECKPOINT"); Self { dir, base_commit: base_commit.to_string(), @@ -176,6 +186,8 @@ impl PersistedWorkingLog { canonical_workdir, dirty_files, initial_file, + squash_tree_file, + squash_skip_checkpoint_file, } } @@ -210,6 +222,14 @@ impl PersistedWorkingLog { fs::remove_file(&self.initial_file)?; } + // Clear squash staged-tree marker if present. + if self.squash_tree_file.exists() { + fs::remove_file(&self.squash_tree_file)?; + } + if self.squash_skip_checkpoint_file.exists() { + fs::remove_file(&self.squash_skip_checkpoint_file)?; + } + Ok(()) } @@ -594,6 +614,35 @@ impl PersistedWorkingLog { } } } + + /// Persist the staged index tree OID captured during squash prep. + pub fn write_squash_tree_oid(&self, tree_oid: &str) -> Result<(), GitAiError> { + fs::write(&self.squash_tree_file, tree_oid.trim())?; + Ok(()) + } + + /// Read the staged index tree OID captured during squash prep. + pub fn read_squash_tree_oid(&self) -> Option { + if !self.squash_tree_file.exists() { + return None; + } + + fs::read_to_string(&self.squash_tree_file) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + } + + /// Mark that pre-commit checkpoint was intentionally skipped for a squash commit. + pub fn mark_squash_precommit_skipped(&self) -> Result<(), GitAiError> { + fs::write(&self.squash_skip_checkpoint_file, "1")?; + Ok(()) + } + + /// Returns true when pre-commit checkpoint was intentionally skipped for squash. 
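+    /// Cleared together with the squash tree marker by reset_working_log.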
+ pub fn was_squash_precommit_skipped(&self) -> bool { + self.squash_skip_checkpoint_file.exists() + } } #[cfg(test)] @@ -931,4 +980,70 @@ mod tests { "Working log directory should be in correct location" ); } + + #[test] + fn test_squash_marker_roundtrip_and_skip_flag() { + let tmp_repo = TmpRepo::new().expect("Failed to create tmp repo"); + let repo_storage = + RepoStorage::for_repo_path(tmp_repo.repo().path(), tmp_repo.repo().workdir().unwrap()); + let working_log = repo_storage.working_log_for_base_commit("test-commit-sha"); + + assert_eq!( + working_log.read_squash_tree_oid(), + None, + "tree marker should be absent before write" + ); + assert!( + !working_log.was_squash_precommit_skipped(), + "skip marker should be absent before write" + ); + + working_log + .write_squash_tree_oid(" abcdef1234567890 \n") + .expect("write squash tree marker"); + assert_eq!( + working_log.read_squash_tree_oid().as_deref(), + Some("abcdef1234567890"), + "tree marker should be trimmed on read" + ); + + working_log + .mark_squash_precommit_skipped() + .expect("mark squash pre-commit skipped"); + assert!( + working_log.was_squash_precommit_skipped(), + "skip marker should be persisted" + ); + } + + #[test] + fn test_reset_working_log_clears_squash_markers() { + let tmp_repo = TmpRepo::new().expect("Failed to create tmp repo"); + let repo_storage = + RepoStorage::for_repo_path(tmp_repo.repo().path(), tmp_repo.repo().workdir().unwrap()); + let working_log = repo_storage.working_log_for_base_commit("test-commit-sha"); + + working_log + .write_squash_tree_oid("abcdef1234567890") + .expect("write squash tree marker"); + working_log + .mark_squash_precommit_skipped() + .expect("mark squash pre-commit skipped"); + assert!(working_log.squash_tree_file.exists()); + assert!(working_log.squash_skip_checkpoint_file.exists()); + + working_log + .reset_working_log() + .expect("reset working log should succeed"); + + assert_eq!( + working_log.read_squash_tree_oid(), + None, + "reset should clear tree marker" + ); + assert!( + !working_log.was_squash_precommit_skipped(), + "reset should clear skip marker" + ); + } } diff --git a/src/git/repository.rs b/src/git/repository.rs index 58988234c..f8a12e3d7 100644 --- a/src/git/repository.rs +++ b/src/git/repository.rs @@ -10,12 +10,14 @@ use crate::git::rewrite_log::RewriteLogEvent; use crate::git::status::MAX_PATHSPEC_ARGS; use crate::git::sync_authorship::{fetch_authorship_notes, push_authorship_notes}; use crate::utils::GIT_AI_SKIP_CORE_HOOKS_ENV; +use crate::utils::debug_log; #[cfg(windows)] use crate::utils::is_interactive_terminal; use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use std::process::{Command, Output}; +use std::time::Instant; #[cfg(windows)] use crate::utils::CREATE_NO_WINDOW; @@ -906,20 +908,28 @@ impl Repository { supress_output: bool, apply_side_effects: bool, ) { - let log = self - .storage - .append_rewrite_event(rewrite_log_event.clone()) + let append_start = Instant::now(); + self.storage + .append_rewrite_event_fast(rewrite_log_event.clone()) .expect("Error writing .git/ai/rewrite_log"); - - if apply_side_effects - && let Ok(_) = rewrite_authorship_if_needed( + debug_log(&format!( + "[BENCHMARK] rewrite_log: append event took {:?}", + append_start.elapsed() + )); + + if apply_side_effects { + let rewrite_start = Instant::now(); + let _ = rewrite_authorship_if_needed( self, &rewrite_log_event, commit_author, - &log, supress_output, - ) - {} + ); + debug_log(&format!( + "[BENCHMARK] rewrite_log: apply side effects took {:?}", 
+ rewrite_start.elapsed() + )); + } } // Internal util to get the git object type for a given OID @@ -1629,6 +1639,82 @@ impl Repository { Ok(output.stdout) } + /// Get UTF-8 content for many files at a specific commit using one batched cat-file call. + /// Files that are missing at the commit or are not valid UTF-8 are skipped. + pub fn get_files_content_at_commit( + &self, + commit_sha: &str, + file_paths: &[String], + ) -> Result, GitAiError> { + if file_paths.is_empty() { + return Ok(HashMap::new()); + } + + let mut args = self.global_args_for_exec(); + args.push("cat-file".to_string()); + args.push("--batch".to_string()); + + let mut stdin_data = String::new(); + for file_path in file_paths { + stdin_data.push_str(commit_sha); + stdin_data.push(':'); + stdin_data.push_str(file_path); + stdin_data.push('\n'); + } + + let output = exec_git_stdin(&args, stdin_data.as_bytes())?; + let data = output.stdout; + let mut pos = 0usize; + let mut contents = HashMap::new(); + + for file_path in file_paths { + let header_end = match data[pos..].iter().position(|&b| b == b'\n') { + Some(idx) => pos + idx, + None => break, + }; + let header = std::str::from_utf8(&data[pos..header_end])?; + pos = header_end + 1; + + if header.ends_with(" missing") { + continue; + } + + let mut header_parts = header.rsplitn(3, ' '); + let size_str = header_parts + .next() + .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; + let object_type = header_parts + .next() + .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; + + if object_type != "blob" { + continue; + } + + let size: usize = size_str + .parse() + .map_err(|e| GitAiError::Generic(format!("Invalid cat-file blob size: {}", e)))?; + + let content_end = pos + size; + if content_end > data.len() { + return Err(GitAiError::Generic( + "Malformed cat-file --batch output: truncated content".to_string(), + )); + } + + if let Ok(content) = String::from_utf8(data[pos..content_end].to_vec()) { + contents.insert(file_path.clone(), content); + } + + pos = content_end; + if pos < data.len() && data[pos] == b'\n' { + pos += 1; + } + } + + Ok(contents) + } + /// Get content of all staged files concurrently /// Returns a HashMap of file paths to their staged content as strings /// Skips files that fail to read or aren't valid UTF-8 @@ -1636,40 +1722,68 @@ impl Repository { &self, file_paths: &[String], ) -> Result, GitAiError> { - use futures::future::join_all; - use std::sync::Arc; - - const MAX_CONCURRENT: usize = 30; - - let repo_global_args = self.global_args_for_exec(); - let semaphore = Arc::new(smol::lock::Semaphore::new(MAX_CONCURRENT)); - - let futures: Vec<_> = file_paths - .iter() - .map(|file_path| { - let mut args = repo_global_args.clone(); - args.push("show".to_string()); - args.push(format!(":{}", file_path)); - let file_path = file_path.clone(); - let semaphore = semaphore.clone(); - - async move { - let _permit = semaphore.acquire().await; - let result = exec_git(&args).and_then(|output| { - String::from_utf8(output.stdout) - .map_err(|e| GitAiError::Utf8Error(e.utf8_error())) - }); - (file_path, result) - } - }) - .collect(); - - let results = smol::block_on(async { join_all(futures).await }); + if file_paths.is_empty() { + return Ok(HashMap::new()); + } let mut staged_files = HashMap::new(); - for (file_path, result) in results { - if let Ok(content) = result { - staged_files.insert(file_path, content); + + let mut args = self.global_args_for_exec(); + args.push("cat-file".to_string()); + 
args.push("--batch".to_string()); + + let mut stdin_data = String::new(); + for file_path in file_paths { + stdin_data.push(':'); + stdin_data.push_str(file_path); + stdin_data.push('\n'); + } + + let output = exec_git_stdin(&args, stdin_data.as_bytes())?; + let data = output.stdout; + + let mut pos = 0usize; + for file_path in file_paths { + let header_end = match data[pos..].iter().position(|&b| b == b'\n') { + Some(idx) => pos + idx, + None => break, + }; + let header = std::str::from_utf8(&data[pos..header_end])?; + pos = header_end + 1; + + if header.ends_with(" missing") { + continue; + } + + let mut header_parts = header.rsplitn(3, ' '); + let size_str = header_parts + .next() + .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; + let object_type = header_parts + .next() + .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; + + if object_type != "blob" { + continue; + } + + let size: usize = size_str + .parse() + .map_err(|e| GitAiError::Generic(format!("Invalid cat-file blob size: {}", e)))?; + let content_end = pos + size; + if content_end > data.len() { + return Err(GitAiError::Generic( + "Malformed cat-file --batch output: truncated content".to_string(), + )); + } + + if let Ok(content) = String::from_utf8(data[pos..content_end].to_vec()) { + staged_files.insert(file_path.clone(), content); + } + + pos = content_end; + if pos < data.len() && data[pos] == b'\n' { + pos += 1; } } @@ -1738,6 +1852,15 @@ impl Repository { Ok(files) } + /// Materialize the current index as a tree object and return its OID. + /// Useful for detecting whether staged content changed between hook phases. + pub fn index_tree_oid(&self) -> Result { + let mut args = self.global_args_for_exec(); + args.push("write-tree".to_string()); + let output = exec_git(&args)?; + Ok(String::from_utf8(output.stdout)?.trim().to_string()) + } + /// Get added line ranges from git diff between two commits /// Returns a HashMap of file paths to vectors of added line numbers /// @@ -2775,6 +2898,67 @@ index 0000000..abc1234 100644 ); } + #[test] + fn test_get_files_content_at_commit_reads_multiple_files_and_skips_missing() { + use crate::git::test_utils::TmpRepo; + + let tmp_repo = TmpRepo::new().unwrap(); + tmp_repo.write_file("a.txt", "a1\n", true).unwrap(); + tmp_repo.write_file("b.txt", "b1\n", true).unwrap(); + tmp_repo.commit_with_message("base").unwrap(); + + let commit_sha = tmp_repo.head_commit_sha().unwrap(); + let repo = tmp_repo.gitai_repo(); + let files = vec![ + "a.txt".to_string(), + "b.txt".to_string(), + "missing.txt".to_string(), + ]; + let contents = repo.get_files_content_at_commit(&commit_sha, &files).unwrap(); + + assert_eq!(contents.get("a.txt").map(String::as_str), Some("a1\n")); + assert_eq!(contents.get("b.txt").map(String::as_str), Some("b1\n")); + assert!( + !contents.contains_key("missing.txt"), + "missing paths should be skipped" + ); + } + + #[test] + fn test_get_all_staged_files_content_reads_batched_index_state() { + use crate::git::test_utils::TmpRepo; + + let tmp_repo = TmpRepo::new().unwrap(); + tmp_repo.write_file("tracked.txt", "base\n", true).unwrap(); + tmp_repo.commit_with_message("base").unwrap(); + + tmp_repo + .write_file("tracked.txt", "base\nstaged update\n", true) + .unwrap(); + tmp_repo.write_file("new.txt", "new staged file\n", true).unwrap(); + + let repo = tmp_repo.gitai_repo(); + let files = vec![ + "tracked.txt".to_string(), + "new.txt".to_string(), + "missing.txt".to_string(), + ]; + let staged = 
repo.get_all_staged_files_content(&files).unwrap(); + + assert_eq!( + staged.get("tracked.txt").map(String::as_str), + Some("base\nstaged update\n") + ); + assert_eq!( + staged.get("new.txt").map(String::as_str), + Some("new staged file\n") + ); + assert!( + !staged.contains_key("missing.txt"), + "missing staged paths should be skipped" + ); + } + #[test] fn find_repository_in_path_bare_repo_can_read_head_gitattributes() { let temp = tempfile::tempdir().expect("tempdir"); @@ -2810,4 +2994,27 @@ index 0000000..abc1234 100644 let content = String::from_utf8(content).expect("utf8 attrs"); assert!(content.contains("generated/** linguist-generated=true")); } + + #[test] + fn test_index_tree_oid_changes_with_staged_content() { + use crate::git::test_utils::TmpRepo; + + let tmp_repo = TmpRepo::new().unwrap(); + tmp_repo.write_file("tracked.txt", "base\n", true).unwrap(); + tmp_repo.commit_with_message("base").unwrap(); + + let repo = tmp_repo.gitai_repo(); + let before = repo.index_tree_oid().unwrap(); + assert!(!before.is_empty(), "tree oid should not be empty"); + + tmp_repo + .write_file("tracked.txt", "base\nstaged update\n", true) + .unwrap(); + let after = repo.index_tree_oid().unwrap(); + assert!(!after.is_empty(), "tree oid should not be empty after staging"); + assert_ne!( + before, after, + "staging file content changes should produce a different index tree OID" + ); + } } diff --git a/tests/squash_merge.rs b/tests/squash_merge.rs index 7d6c31c73..41aac9305 100644 --- a/tests/squash_merge.rs +++ b/tests/squash_merge.rs @@ -293,3 +293,129 @@ fn test_prepare_working_log_squash_with_mixed_additions() { "Sum of accepted_lines across prompts should match ai_accepted stat" ); } + +/// Human-only squash merges should not synthesize AI attestations/prompts. +#[test] +fn test_prepare_working_log_squash_human_only_fast_path() { + let repo = TestRepo::new(); + let mut file = repo.filename("human_only.txt"); + + file.set_contents(lines!["base line"]); + repo.stage_all_and_commit("Initial commit").unwrap(); + let default_branch = repo.current_branch(); + + repo.git(&["checkout", "-b", "feature"]).unwrap(); + file.insert_at(1, lines!["human feature line"]); + repo.stage_all_and_commit("Human-only feature change") + .unwrap(); + + repo.git(&["checkout", &default_branch]).unwrap(); + repo.git(&["merge", "--squash", "feature"]).unwrap(); + let squash_commit = repo.commit("Squash human-only feature").unwrap(); + + file.assert_lines_and_blame(lines!["base line".human(), "human feature line".human()]); + assert!( + squash_commit.authorship_log.attestations.is_empty(), + "No AI attestations expected for human-only squash" + ); + assert!( + squash_commit.authorship_log.metadata.prompts.is_empty(), + "No AI prompts expected for human-only squash" + ); +} + +/// Unrelated AI churn on the target branch should not pollute squash metadata. 
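+/// The target branch keeps its own AI blame; only the squash commit's attestations and
+/// prompts are asserted to cover feature.txt exclusively.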
+#[test] +fn test_prepare_working_log_squash_ignores_unrelated_target_ai_files() { + let repo = TestRepo::new(); + let mut feature_file = repo.filename("feature.txt"); + let mut target_file = repo.filename("target.txt"); + + feature_file.set_contents(lines!["feature base"]); + target_file.set_contents(lines!["target base"]); + repo.stage_all_and_commit("Initial base").unwrap(); + let default_branch = repo.current_branch(); + + // Feature branch adds AI content in feature.txt + repo.git(&["checkout", "-b", "feature"]).unwrap(); + feature_file.insert_at(1, lines!["feature ai line".ai()]); + repo.stage_all_and_commit("Feature AI change").unwrap(); + + // Target branch adds unrelated AI content in target.txt + repo.git(&["checkout", &default_branch]).unwrap(); + target_file.insert_at(1, lines!["target ai line".ai()]); + repo.stage_all_and_commit("Target AI churn").unwrap(); + + // Squash only feature branch changes into target + repo.git(&["merge", "--squash", "feature"]).unwrap(); + let squash_commit = repo.commit("Squash feature into target").unwrap(); + + feature_file.assert_lines_and_blame(lines!["feature base".human(), "feature ai line".ai()]); + target_file.assert_lines_and_blame(lines!["target base".human(), "target ai line".ai()]); + + // Squash commit should only carry new AI attestation for feature.txt. + assert_eq!( + squash_commit.authorship_log.attestations.len(), + 1, + "Only feature.txt should be attested in squash commit" + ); + assert_eq!( + squash_commit.authorship_log.attestations[0].file_path, "feature.txt", + "Unrelated target AI file should not be included in squash attestations" + ); + assert_eq!( + squash_commit.authorship_log.metadata.prompts.len(), + 1, + "Only feature prompt should be carried into squash commit metadata" + ); + + let stats = repo.stats().unwrap(); + assert_eq!( + stats.ai_additions, 1, + "Only one new AI line should be counted" + ); + assert_eq!( + stats.ai_accepted, 1, + "Only the feature branch AI line should be accepted in squash commit" + ); +} + +/// Small staged-set fast path should not introduce AI attestations for human-only files. 
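+/// Both files arrive via the same squashed branch, so only ai_file.txt may carry an attestation.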
+#[test] +fn test_prepare_working_log_squash_small_staged_bypass_keeps_human_files_clean() { + let repo = TestRepo::new(); + let mut ai_file = repo.filename("ai_file.txt"); + let mut human_file = repo.filename("human_file.txt"); + + ai_file.set_contents(lines!["ai base"]); + human_file.set_contents(lines!["human base"]); + repo.stage_all_and_commit("Initial base").unwrap(); + let default_branch = repo.current_branch(); + + repo.git(&["checkout", "-b", "feature"]).unwrap(); + ai_file.insert_at(1, lines!["ai feature line".ai()]); + human_file.insert_at(1, lines!["human feature line"]); + repo.stage_all_and_commit("Mixed feature commit").unwrap(); + + repo.git(&["checkout", &default_branch]).unwrap(); + repo.git(&["merge", "--squash", "feature"]).unwrap(); + let squash_commit = repo.commit("Squash mixed feature").unwrap(); + + ai_file.assert_lines_and_blame(lines!["ai base".human(), "ai feature line".ai()]); + human_file.assert_lines_and_blame(lines!["human base".human(), "human feature line".human()]); + + assert_eq!( + squash_commit.authorship_log.attestations.len(), + 1, + "Human-only file should not receive AI attestations" + ); + assert_eq!( + squash_commit.authorship_log.attestations[0].file_path, + "ai_file.txt" + ); + assert_eq!( + squash_commit.authorship_log.metadata.prompts.len(), + 1, + "Only AI prompt metadata should be carried into squash commit" + ); +} From e9b356c9ff54bef2d65e6ec92400ee2399d98d27 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Fri, 13 Feb 2026 16:07:54 -0500 Subject: [PATCH 2/3] Fix formatting after main rebase --- src/authorship/post_commit.rs | 8 ++------ src/authorship/rebase_authorship.rs | 6 +++--- src/commands/checkpoint.rs | 5 ++++- src/git/repository.rs | 13 ++++++++++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/authorship/post_commit.rs b/src/authorship/post_commit.rs index 15754b4b8..a362a254f 100644 --- a/src/authorship/post_commit.rs +++ b/src/authorship/post_commit.rs @@ -840,12 +840,8 @@ mod tests { "human".to_string(), vec![], ); - let empty_entry = WorkingLogEntry::new( - "file.txt".to_string(), - "blob".to_string(), - vec![], - vec![], - ); + let empty_entry = + WorkingLogEntry::new("file.txt".to_string(), "blob".to_string(), vec![], vec![]); assert!( !checkpoint_entry_requires_post_processing(&empty_human_checkpoint, &empty_entry), "human entry with no AI attribution should be skipped" diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 429478870..9ba3dd01a 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -2936,9 +2936,9 @@ fn transform_attributions_to_final_state( mod tests { use super::{ collect_changed_file_contents_from_diff, compress_line_attributions, - get_pathspecs_from_commits, - parse_cat_file_batch_output_with_oids, transform_attributions_to_final_state, - try_fast_path_rebase_note_remap, walk_commits_to_base, + get_pathspecs_from_commits, parse_cat_file_batch_output_with_oids, + transform_attributions_to_final_state, try_fast_path_rebase_note_remap, + walk_commits_to_base, }; use crate::authorship::attribution_tracker::{Attribution, LineAttribution}; use crate::authorship::authorship_log::LineRange; diff --git a/src/commands/checkpoint.rs b/src/commands/checkpoint.rs index e92437078..98428f09f 100644 --- a/src/commands/checkpoint.rs +++ b/src/commands/checkpoint.rs @@ -1958,7 +1958,10 @@ mod tests { assert_eq!(entries_len, 0, "fast path should skip checkpoint entries"); assert_eq!(files_len, 0, "fast path should 
skip file processing"); - assert_eq!(checkpoints_len, 0, "fast path should not append checkpoints"); + assert_eq!( + checkpoints_len, 0, + "fast path should not append checkpoints" + ); assert!( working_log.was_squash_precommit_skipped(), "fast path should mark squash pre-commit as skipped" diff --git a/src/git/repository.rs b/src/git/repository.rs index f8a12e3d7..f8f9335ab 100644 --- a/src/git/repository.rs +++ b/src/git/repository.rs @@ -2914,7 +2914,9 @@ index 0000000..abc1234 100644 "b.txt".to_string(), "missing.txt".to_string(), ]; - let contents = repo.get_files_content_at_commit(&commit_sha, &files).unwrap(); + let contents = repo + .get_files_content_at_commit(&commit_sha, &files) + .unwrap(); assert_eq!(contents.get("a.txt").map(String::as_str), Some("a1\n")); assert_eq!(contents.get("b.txt").map(String::as_str), Some("b1\n")); @@ -2935,7 +2937,9 @@ index 0000000..abc1234 100644 tmp_repo .write_file("tracked.txt", "base\nstaged update\n", true) .unwrap(); - tmp_repo.write_file("new.txt", "new staged file\n", true).unwrap(); + tmp_repo + .write_file("new.txt", "new staged file\n", true) + .unwrap(); let repo = tmp_repo.gitai_repo(); let files = vec![ @@ -3011,7 +3015,10 @@ index 0000000..abc1234 100644 .write_file("tracked.txt", "base\nstaged update\n", true) .unwrap(); let after = repo.index_tree_oid().unwrap(); - assert!(!after.is_empty(), "tree oid should not be empty after staging"); + assert!( + !after.is_empty(), + "tree oid should not be empty after staging" + ); assert_ne!( before, after, "staging file content changes should produce a different index tree OID" From 313ae9f621ccb9daf2ffc1b88607930895a62ea5 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Fri, 13 Feb 2026 17:04:52 -0500 Subject: [PATCH 3/3] Fix batched cat-file parsing for non-blob entries --- src/git/repository.rs | 239 ++++++++++++++++++++++++------------------ 1 file changed, 138 insertions(+), 101 deletions(-) diff --git a/src/git/repository.rs b/src/git/repository.rs index f8f9335ab..5dea8942f 100644 --- a/src/git/repository.rs +++ b/src/git/repository.rs @@ -1663,59 +1663,10 @@ impl Repository { } let output = exec_git_stdin(&args, stdin_data.as_bytes())?; - let data = output.stdout; - let mut pos = 0usize; - let mut contents = HashMap::new(); - - for file_path in file_paths { - let header_end = match data[pos..].iter().position(|&b| b == b'\n') { - Some(idx) => pos + idx, - None => break, - }; - let header = std::str::from_utf8(&data[pos..header_end])?; - pos = header_end + 1; - - if header.ends_with(" missing") { - continue; - } - - let mut header_parts = header.rsplitn(3, ' '); - let size_str = header_parts - .next() - .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; - let object_type = header_parts - .next() - .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?; - - if object_type != "blob" { - continue; - } - - let size: usize = size_str - .parse() - .map_err(|e| GitAiError::Generic(format!("Invalid cat-file blob size: {}", e)))?; - - let content_end = pos + size; - if content_end > data.len() { - return Err(GitAiError::Generic( - "Malformed cat-file --batch output: truncated content".to_string(), - )); - } - - if let Ok(content) = String::from_utf8(data[pos..content_end].to_vec()) { - contents.insert(file_path.clone(), content); - } - - pos = content_end; - if pos < data.len() && data[pos] == b'\n' { - pos += 1; - } - } - - Ok(contents) + parse_batch_blob_content_by_path(&output.stdout, file_paths) } - /// Get content of all staged files 
+    /// Get UTF-8 content for many staged files with one batched cat-file call.
     /// Returns a HashMap of file paths to their staged content as strings
     /// Skips files that fail to read or aren't valid UTF-8
     pub fn get_all_staged_files_content(
@@ -1726,8 +1677,6 @@ impl Repository {
             return Ok(HashMap::new());
         }
 
-        let mut staged_files = HashMap::new();
-
         let mut args = self.global_args_for_exec();
         args.push("cat-file".to_string());
         args.push("--batch".to_string());
@@ -1740,54 +1689,7 @@ impl Repository {
         }
 
         let output = exec_git_stdin(&args, stdin_data.as_bytes())?;
-        let data = output.stdout;
-
-        let mut pos = 0usize;
-        for file_path in file_paths {
-            let header_end = match data[pos..].iter().position(|&b| b == b'\n') {
-                Some(idx) => pos + idx,
-                None => break,
-            };
-            let header = std::str::from_utf8(&data[pos..header_end])?;
-            pos = header_end + 1;
-
-            if header.ends_with(" missing") {
-                continue;
-            }
-
-            let mut header_parts = header.rsplitn(3, ' ');
-            let size_str = header_parts
-                .next()
-                .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?;
-            let object_type = header_parts
-                .next()
-                .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?;
-
-            if object_type != "blob" {
-                continue;
-            }
-
-            let size: usize = size_str
-                .parse()
-                .map_err(|e| GitAiError::Generic(format!("Invalid cat-file blob size: {}", e)))?;
-            let content_end = pos + size;
-            if content_end > data.len() {
-                return Err(GitAiError::Generic(
-                    "Malformed cat-file --batch output: truncated content".to_string(),
-                ));
-            }
-
-            if let Ok(content) = String::from_utf8(data[pos..content_end].to_vec()) {
-                staged_files.insert(file_path.clone(), content);
-            }
-
-            pos = content_end;
-            if pos < data.len() && data[pos] == b'\n' {
-                pos += 1;
-            }
-        }
-
-        Ok(staged_files)
+        parse_batch_blob_content_by_path(&output.stdout, file_paths)
     }
 
     /// List all files changed in a commit
@@ -2216,6 +2118,67 @@ pub fn find_repository_in_path(path: &str) -> Result {
     find_repository(&global_args)
 }
 
+fn parse_batch_blob_content_by_path(
+    data: &[u8],
+    file_paths: &[String],
+) -> Result<HashMap<String, String>, GitAiError> {
+    let mut pos = 0usize;
+    let mut contents = HashMap::new();
+
+    for file_path in file_paths {
+        if pos >= data.len() {
+            break;
+        }
+
+        let header_end = match data[pos..].iter().position(|&b| b == b'\n') {
+            Some(idx) => pos + idx,
+            None => break,
+        };
+
+        let header = std::str::from_utf8(&data[pos..header_end])?;
+        pos = header_end + 1;
+
+        if header.ends_with(" missing") {
+            continue;
+        }
+
+        let mut header_parts = header.rsplitn(3, ' ');
+        let size_str = header_parts
+            .next()
+            .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?;
+        let object_type = header_parts
+            .next()
+            .ok_or_else(|| GitAiError::Generic("Malformed cat-file header".to_string()))?;
+        let size: usize = size_str
+            .parse()
+            .map_err(|e| GitAiError::Generic(format!("Invalid cat-file blob size: {}", e)))?;
+
+        let content_end = pos.checked_add(size).ok_or_else(|| {
+            GitAiError::Generic("Malformed cat-file --batch output: content overflow".to_string())
+        })?;
+        if content_end > data.len() {
+            return Err(GitAiError::Generic(
+                "Malformed cat-file --batch output: truncated content".to_string(),
+            ));
+        }
+
+        if object_type == "blob"
+            && let Ok(content) = std::str::from_utf8(&data[pos..content_end])
+        {
+            contents.insert(file_path.clone(), content.to_string());
+        }
+
+        // cat-file --batch emits a body for all non-missing objects. Always consume it,
+        // even when we intentionally skip non-blob objects.
+        pos = content_end;
+        if pos < data.len() && data[pos] == b'\n' {
+            pos += 1;
+        }
+    }
+
+    Ok(contents)
+}
+
 /// Find the git repository that contains the given file path by walking up the directory tree.
 ///
 /// This function is useful when working with multi-repository workspaces where the workspace
@@ -2926,6 +2889,47 @@ index 0000000..abc1234 100644
         );
     }
 
+    #[test]
+    fn test_parse_batch_blob_content_by_path_skips_non_blob_payload() {
+        let mut batch = Vec::new();
+        batch.extend_from_slice(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa tree 6\nab\ncd\0\n");
+        batch.extend_from_slice(b"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb blob 5\nhello\n");
+
+        let paths = vec!["dir".to_string(), "file.txt".to_string()];
+        let parsed = parse_batch_blob_content_by_path(&batch, &paths).unwrap();
+
+        assert_eq!(parsed.get("file.txt").map(String::as_str), Some("hello"));
+        assert!(
+            !parsed.contains_key("dir"),
+            "non-blob objects should be skipped"
+        );
+    }
+
+    #[test]
+    fn test_get_files_content_at_commit_skips_tree_entries_without_desync() {
+        use crate::git::test_utils::TmpRepo;
+
+        let tmp_repo = TmpRepo::new().unwrap();
+        tmp_repo
+            .write_file("dir/nested.txt", "nested\n", true)
+            .unwrap();
+        tmp_repo.write_file("root.txt", "root\n", true).unwrap();
+        tmp_repo.commit_with_message("base").unwrap();
+
+        let commit_sha = tmp_repo.head_commit_sha().unwrap();
+        let repo = tmp_repo.gitai_repo();
+        let files = vec!["dir".to_string(), "root.txt".to_string()];
+        let contents = repo
+            .get_files_content_at_commit(&commit_sha, &files)
+            .unwrap();
+
+        assert_eq!(contents.get("root.txt").map(String::as_str), Some("root\n"));
+        assert!(
+            !contents.contains_key("dir"),
+            "tree entries should be skipped without affecting subsequent files"
+        );
+    }
+
     #[test]
     fn test_get_all_staged_files_content_reads_batched_index_state() {
         use crate::git::test_utils::TmpRepo;
@@ -2999,6 +3003,39 @@ index 0000000..abc1234 100644
         assert!(content.contains("generated/** linguist-generated=true"));
     }
 
+    #[test]
+    fn test_get_all_staged_files_content_skips_gitlink_without_desync() {
+        use crate::git::test_utils::TmpRepo;
+        use std::process::Command;
+
+        let tmp_repo = TmpRepo::new().unwrap();
+        tmp_repo.write_file("root.txt", "root\n", true).unwrap();
+        tmp_repo.commit_with_message("base").unwrap();
+
+        let head_sha = tmp_repo.head_commit_sha().unwrap();
+        let status = Command::new(crate::config::Config::get().git_cmd())
+            .current_dir(tmp_repo.path())
+            .args([
+                "update-index",
+                "--add",
+                "--cacheinfo",
+                &format!("160000,{},submodule", head_sha),
+            ])
+            .status()
+            .unwrap();
+        assert!(status.success(), "failed to stage synthetic gitlink entry");
+
+        let repo = tmp_repo.gitai_repo();
+        let files = vec!["submodule".to_string(), "root.txt".to_string()];
+        let staged = repo.get_all_staged_files_content(&files).unwrap();
+
+        assert_eq!(staged.get("root.txt").map(String::as_str), Some("root\n"));
+        assert!(
+            !staged.contains_key("submodule"),
+            "gitlink entries should be skipped without affecting subsequent files"
+        );
+    }
+
     #[test]
     fn test_index_tree_oid_changes_with_staged_content() {
         use crate::git::test_utils::TmpRepo;