diff --git a/scripts/repro_attr_bloat.sh b/scripts/repro_attr_bloat.sh new file mode 100644 index 000000000..9d2cb3b8e --- /dev/null +++ b/scripts/repro_attr_bloat.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Repro script: create a large file, apply many small edits, checkpoint with mock_ai, +# then commit via git-ai to stress attribution range growth and memory usage. + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +# Locate git-ai binary (override with GIT_AI_BIN=...). +GIT_AI_BIN="${GIT_AI_BIN:-}" +if [[ -z "${GIT_AI_BIN}" ]]; then + if [[ -x "${REPO_ROOT}/target/debug/git-ai" ]]; then + GIT_AI_BIN="${REPO_ROOT}/target/debug/git-ai" + elif [[ -x "${REPO_ROOT}/target/release/git-ai" ]]; then + GIT_AI_BIN="${REPO_ROOT}/target/release/git-ai" + fi +fi + +if [[ -z "${GIT_AI_BIN}" ]]; then + echo "ERROR: git-ai binary not found. Build it first (e.g., cargo build) and/or set GIT_AI_BIN." + exit 1 +fi + +# Tuning knobs (override via env) +LINES="${LINES:-200000}" # number of lines in the large file +LINE_LEN="${LINE_LEN:-120}" # approx chars per line +ITERATIONS="${ITERATIONS:-50}" # number of edit+checkpoint cycles +EDIT_STRIDE="${EDIT_STRIDE:-10}" # edit every Nth line each iteration +REPORT_EVERY="${REPORT_EVERY:-10}" # print attribution stats every N iterations +KEEP_DIR="${KEEP_DIR:-0}" # set to 1 to keep the temp repo + +# time(1) command for peak RSS (macOS: -l, Linux: -v) +TIME_CMD="" +if command -v /usr/bin/time >/dev/null 2>&1; then + if [[ "$(uname -s)" == "Darwin" ]]; then + TIME_CMD="/usr/bin/time -l" + else + TIME_CMD="/usr/bin/time -v" + fi +fi + +WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/git-ai-attr-bloat-XXXXXX")" +cleanup() { + if [[ "${KEEP_DIR}" == "1" ]]; then + echo "Repo kept at: ${WORKDIR}" + else + rm -rf "${WORKDIR}" + fi +} +trap cleanup EXIT + +cd "${WORKDIR}" +git init -q +git config user.email "repro@example.com" +git config user.name "Repro" + +echo "Generating ${LINES} lines..." +python3 - <= len(marker) + 1: + line = line[:-(len(marker)+1)] + " " + marker + else: + line = line + " " + marker + line = line + "\n" + out.append(line) +with open("big.txt", "w", encoding="utf-8") as f: + f.writelines(out) +PY + + "${GIT_AI_BIN}" checkpoint mock_ai big.txt >/dev/null + + if (( i % REPORT_EVERY == 0 )); then + base_sha="$(git rev-parse HEAD)" + checkpoints="${WORKDIR}/.git/ai/working_logs/${base_sha}/checkpoints.jsonl" + size_bytes=0 + if [[ -f "${checkpoints}" ]]; then + size_bytes=$(wc -c < "${checkpoints}") + fi + python3 - < Vec { + #[derive(Clone)] + struct AttributionCursor<'a> { + attrs: &'a [Attribution], + idx: usize, + } + + impl<'a> AttributionCursor<'a> { + fn new(attrs: &'a [Attribution]) -> Self { + Self { attrs, idx: 0 } + } + + fn overlaps_in_range(&mut self, start: usize, end: usize) -> &'a [Attribution] { + while self.idx < self.attrs.len() && self.attrs[self.idx].end <= start { + self.idx += 1; + } + let mut end_idx = self.idx; + while end_idx < self.attrs.len() && self.attrs[end_idx].start < end { + end_idx += 1; + } + &self.attrs[self.idx..end_idx] + } + } + + let mut sorted_attributions = old_attributions.to_vec(); + sorted_attributions.sort_by_key(|a| (a.start, a.end, a.ts)); + let mut new_attributions = Vec::new(); // Build lookup maps for moves @@ -766,6 +792,8 @@ impl AttributionTracker { let mut insertion_idx = 0; let mut prev_whitespace_delete = false; + let mut cursor = AttributionCursor::new(&sorted_attributions); + for diff in diffs { let op = diff.op(); let len = diff.data().len(); @@ -776,7 +804,8 @@ impl AttributionTracker { let old_range = (old_pos, old_pos + len); let new_range = (new_pos, new_pos + len); - for attr in old_attributions { + let overlaps = cursor.overlaps_in_range(old_range.0, old_range.1); + for attr in overlaps { if let Some((overlap_start, overlap_end)) = attr.intersection(old_range.0, old_range.1) { @@ -799,6 +828,7 @@ impl AttributionTracker { } ByteDiffOp::Delete => { let deletion_range = (old_pos, old_pos + len); + let deletion_overlaps = cursor.overlaps_in_range(deletion_range.0, deletion_range.1); // Check if this deletion is part of a move if let Some(mappings) = deletion_to_move.get(&deletion_idx) { @@ -810,7 +840,7 @@ impl AttributionTracker { if source_start < source_end { let target_start = insertion.start + mapping.target_range.0; - for attr in old_attributions { + for attr in deletion_overlaps { if let Some((overlap_start, overlap_end)) = attr.intersection(source_start, source_end) { @@ -1719,24 +1749,112 @@ pub fn attributions_to_line_attributions( return Vec::new(); } - // For each line, determine the dominant author + // Sort attributions by start to enable sweep-line processing. + let mut sorted_attrs: Vec<&Attribution> = attributions.iter().collect(); + sorted_attrs.sort_by_key(|a| (a.start, a.end, a.ts)); + + let mut active: Vec<&Attribution> = Vec::new(); + let mut attr_idx = 0usize; + + // For each line, determine the dominant author using only active attributions. let mut line_authors: Vec)>> = Vec::with_capacity(line_count as usize); for line_num in 1..=line_count { - let (author, overrode) = - find_dominant_author_for_line(line_num, &boundaries, attributions, content); - line_authors.push(Some((author, overrode))); + let (line_start, line_end) = boundaries.get_line_range(line_num).unwrap(); + + while attr_idx < sorted_attrs.len() && sorted_attrs[attr_idx].start < line_end { + active.push(sorted_attrs[attr_idx]); + attr_idx += 1; + } + + // Remove attributions that end before this line. + active.retain(|attr| attr.end > line_start && attr.start < line_end); + + let line_content = &content[line_start..line_end]; + let is_line_empty = + line_content.is_empty() || line_content.chars().all(|c| c.is_whitespace()); + + let mut candidate_attrs: Vec<&Attribution> = Vec::new(); + for attribution in &active { + if !attribution.overlaps(line_start, line_end) { + continue; + } + + let is_deletion_marker = attribution.start == attribution.end + && attribution.start >= line_start + && attribution.start <= line_end; + + let slice_start = std::cmp::max(line_start, attribution.start); + let slice_end = std::cmp::min(line_end, attribution.end); + let mut attr_non_whitespace_count = 0; + if slice_start < slice_end { + let safe_start = if content.is_char_boundary(slice_start) { + slice_start + } else { + floor_char_boundary(content, slice_start).max(line_start) + }; + let safe_end = if content.is_char_boundary(slice_end) { + slice_end + } else { + ceil_char_boundary(content, slice_end).min(line_end) + }; + + if safe_start < safe_end { + let content_slice = &content[safe_start..safe_end]; + attr_non_whitespace_count = + content_slice.chars().filter(|c| !c.is_whitespace()).count(); + } + } + + if attr_non_whitespace_count > 0 || is_line_empty || is_deletion_marker { + candidate_attrs.push(*attribution); + } + } + + if candidate_attrs.is_empty() { + line_authors.push(Some((CheckpointKind::Human.to_str(), None))); + continue; + } + + // Choose the author with the latest timestamp + let latest_timestamp = candidate_attrs + .iter() + .max_by_key(|a| a.ts) + .unwrap() + .ts; + let latest_author = candidate_attrs + .iter() + .filter(|a| a.ts == latest_timestamp) + .map(|a| a.author_id.clone()) + .collect::>(); + let last_ai_edit = candidate_attrs + .iter() + .filter(|a| a.author_id != CheckpointKind::Human.to_str()) + .last(); + let last_human_edit = candidate_attrs + .iter() + .filter(|a| a.author_id == CheckpointKind::Human.to_str()) + .last(); + let overrode = match (last_ai_edit, last_human_edit) { + (Some(ai), Some(h)) => { + if h.ts > ai.ts { + Some(ai.author_id.clone()) + } else { + None + } + } + _ => None, + }; + + line_authors.push(Some((latest_author[0].clone(), overrode))); } // Merge consecutive lines with the same author - let mut merged_line_authors = merge_consecutive_line_attributions(line_authors); + let merged_line_authors = merge_consecutive_line_attributions(line_authors); // Strip away all human lines (only AI lines need to be retained) - merged_line_authors.retain(|line_attr| { - line_attr.author_id != CheckpointKind::Human.to_str() || line_attr.overrode.is_some() - }); - merged_line_authors + filter_human_line_attributions(merged_line_authors) } /// Find the dominant author for a specific line based on non-whitespace character count @@ -1822,7 +1940,7 @@ fn find_dominant_author_for_line( } /// Merge consecutive lines with the same author into LineAttribution ranges -fn merge_consecutive_line_attributions( +pub(crate) fn merge_consecutive_line_attributions( line_authorship: Vec)>>, ) -> Vec { let mut result = Vec::new(); @@ -1884,6 +2002,67 @@ fn merge_consecutive_line_attributions( result } + +pub(crate) fn filter_human_line_attributions( + mut line_attributions: Vec, +) -> Vec { + line_attributions.retain(|line_attr| { + line_attr.author_id != CheckpointKind::Human.to_str() || line_attr.overrode.is_some() + }); + line_attributions +} + +pub(crate) fn build_line_authorship_from_ranges( + line_attributions: &[LineAttribution], + total_lines: u32, + default_author: &str, +) -> Vec<(String, Option)> { + let mut line_authors = vec![(default_author.to_string(), None); total_lines as usize]; + for attr in line_attributions { + let start = attr.start_line.max(1); + let end = attr.end_line.min(total_lines); + for line in start..=end { + let idx = (line - 1) as usize; + line_authors[idx] = (attr.author_id.clone(), attr.overrode.clone()); + } + } + line_authors +} + +pub(crate) fn attributions_cover_content( + content: &str, + attributions: &[Attribution], +) -> bool { + if content.is_empty() { + return true; + } + if attributions.is_empty() { + return false; + } + + let mut ranges: Vec<&Attribution> = attributions + .iter() + .filter(|attr| attr.start < attr.end) + .collect(); + if ranges.is_empty() { + return false; + } + + ranges.sort_by_key(|attr| (attr.start, attr.end)); + let mut covered_end = 0usize; + for attr in ranges { + if attr.start > covered_end { + return false; + } + if attr.end > covered_end { + covered_end = attr.end; + } + if covered_end >= content.len() { + return true; + } + } + covered_end >= content.len() +} #[cfg(test)] mod tests { use super::*; @@ -1947,6 +2126,16 @@ mod tests { ); } + #[test] + fn test_attributions_cover_content() { + let content = "hello\nworld\n"; + let attrs = vec![Attribution::new(0, content.len(), "Alice".into(), TEST_TS)]; + assert!(attributions_cover_content(content, &attrs)); + + let partial = vec![Attribution::new(0, 3, "Alice".into(), TEST_TS)]; + assert!(!attributions_cover_content(content, &partial)); + } + #[test] fn whitespace_only_indent_change_preserves_tokens() { let tracker = AttributionTracker::new(); diff --git a/src/commands/checkpoint.rs b/src/commands/checkpoint.rs index d0e10afff..e88a99b89 100644 --- a/src/commands/checkpoint.rs +++ b/src/commands/checkpoint.rs @@ -1,4 +1,8 @@ use crate::authorship::attribution_tracker::{ + attributions_cover_content, + build_line_authorship_from_ranges, + filter_human_line_attributions, + merge_consecutive_line_attributions, Attribution, AttributionTracker, INITIAL_ATTRIBUTION_TS, LineAttribution, }; use crate::authorship::authorship_log::PromptRecord; @@ -743,6 +747,7 @@ fn get_checkpoint_entry_for_file( .get_file_version(&entry.blob_sha) .unwrap_or_default(), entry.attributions.clone(), + entry.line_attributions.clone(), ) }) }); @@ -754,9 +759,10 @@ fn get_checkpoint_entry_for_file( .unwrap_or_default(); let is_from_checkpoint = from_checkpoint.is_some(); - let (previous_content, prev_attributions) = if let Some((content, attrs)) = from_checkpoint { + let (previous_content, prev_attributions, prev_line_attributions) = + if let Some((content, attrs, line_attrs)) = from_checkpoint { // File exists in a previous checkpoint - use that - (content, attrs) + (content, attrs, line_attrs) } else { // File doesn't exist in any previous checkpoint - need to initialize from git + INITIAL // Get previous content from HEAD tree @@ -795,8 +801,14 @@ fn get_checkpoint_entry_for_file( } } - // Start with INITIAL attributions (they win) - let mut prev_line_attributions = initial_attrs_for_file.clone(); + let content_for_line_conversion = if !initial_attrs_for_file.is_empty() { + ¤t_content + } else { + &previous_content + }; + let total_lines = content_for_line_conversion.lines().count() as u32; + let mut line_authors = + vec![(CheckpointKind::Human.to_str(), None); total_lines as usize]; let mut blamed_lines: HashSet = HashSet::new(); // Get blame for lines not in INITIAL @@ -829,48 +841,50 @@ fn get_checkpoint_entry_for_file( // Add blame results for lines NOT covered by INITIAL if let Some((blames, _)) = ai_blame { for (line, author) in blames { + if line == 0 || line > total_lines { + continue; + } blamed_lines.insert(line); // Skip if INITIAL already has this line if initial_covered_lines.contains(&line) { continue; } - // Skip human-authored lines - they should remain human - if author == CheckpointKind::Human.to_str() { - continue; - } - - prev_line_attributions.push(LineAttribution { - start_line: line, - end_line: line, - author_id: author.clone(), - overrode: None, - }); + line_authors[(line - 1) as usize] = (author.clone(), None); } } // For AI checkpoints, attribute any lines NOT in INITIAL and NOT returned by ai_blame if kind != CheckpointKind::Human { - let total_lines = current_content.lines().count() as u32; for line_num in 1..=total_lines { if !initial_covered_lines.contains(&line_num) && !blamed_lines.contains(&line_num) { - prev_line_attributions.push(LineAttribution { - start_line: line_num, - end_line: line_num, - author_id: author_id.as_ref().clone(), - overrode: None, - }); + line_authors[(line_num - 1) as usize] = + (author_id.as_ref().clone(), None); } } } - // For INITIAL attributions, we need to use current_content (not previous_content) - // because INITIAL line numbers refer to the current state of the file - let content_for_line_conversion = if !initial_attrs_for_file.is_empty() { - ¤t_content - } else { - &previous_content - }; + // INITIAL attributions win - overlay them last and preserve override metadata. + for attr in &initial_attrs_for_file { + let start = attr.start_line.max(1); + let end = attr.end_line.min(total_lines); + for line in start..=end { + let idx = (line - 1) as usize; + let prev_author = line_authors[idx].0.clone(); + let overrode = attr.overrode.clone().or_else(|| { + if prev_author != attr.author_id { + Some(prev_author) + } else { + None + } + }); + line_authors[idx] = (attr.author_id.clone(), overrode); + } + } + + let prev_line_attributions = merge_consecutive_line_attributions( + line_authors.into_iter().map(Some).collect(), + ); // Convert any line attributions to character attributions let prev_attributions = @@ -889,7 +903,7 @@ fn get_checkpoint_entry_for_file( previous_content }; - (adjusted_previous, prev_attributions) + (adjusted_previous, prev_attributions, prev_line_attributions) }; // Skip if no changes (but we already checked this earlier, accounting for INITIAL attributions) @@ -904,6 +918,7 @@ fn get_checkpoint_entry_for_file( author_id.as_ref(), &previous_content, &prev_attributions, + Some(&prev_line_attributions), ¤t_content, ts, )?; @@ -1072,22 +1087,89 @@ fn make_entry_for_file( author_id: &str, previous_content: &str, previous_attributions: &Vec, + previous_line_attributions: Option<&[LineAttribution]>, content: &str, ts: u128, ) -> Result<(WorkingLogEntry, FileLineStats), GitAiError> { let tracker = AttributionTracker::new(); + let content_len = content.len(); + let line_count = content.lines().count() as u32; + + let line_only = should_use_line_only(content_len, line_count, previous_attributions.len()); + if line_only { + debug_log(&format!( + "[BENCHMARK] line-only attribution mode for {} (bytes={}, lines={}, prev_attrs={})", + file_path, + content_len, + line_count, + previous_attributions.len() + )); + + let prev_line_attributions = match previous_line_attributions { + Some(line_attrs) => line_attrs.to_vec(), + None => crate::authorship::attribution_tracker::attributions_to_line_attributions( + previous_attributions, + previous_content, + ), + }; + + let line_attr_start = Instant::now(); + let new_line_attributions = line_only_update_line_attributions( + &prev_line_attributions, + previous_content, + content, + author_id, + ); + debug_log(&format!( + "[BENCHMARK] line_only_update_line_attributions for {} took {:?}", + file_path, + line_attr_start.elapsed() + )); + + let new_attributions = + crate::authorship::attribution_tracker::line_attributions_to_attributions( + &new_line_attributions, + content, + ts, + ); + + let stats_start = Instant::now(); + let line_stats = compute_file_line_stats(previous_content, content); + debug_log(&format!( + "[BENCHMARK] compute_file_line_stats for {} took {:?}", + file_path, + stats_start.elapsed() + )); + + let entry = WorkingLogEntry::new( + file_path.to_string(), + blob_sha.to_string(), + new_attributions, + new_line_attributions, + ); + + return Ok((entry, line_stats)); + } + + let needs_unattributed_fill = + !attributions_cover_content(previous_content, previous_attributions); let fill_start = Instant::now(); - let filled_in_prev_attributions = tracker.attribute_unattributed_ranges( - previous_content, - previous_attributions, - &CheckpointKind::Human.to_str(), - ts - 1, - ); + let filled_in_prev_attributions = if needs_unattributed_fill { + tracker.attribute_unattributed_ranges( + previous_content, + previous_attributions, + &CheckpointKind::Human.to_str(), + ts - 1, + ) + } else { + previous_attributions.clone() + }; debug_log(&format!( - "[BENCHMARK] attribute_unattributed_ranges for {} took {:?}", + "[BENCHMARK] attribute_unattributed_ranges for {} took {:?} (needed={})", file_path, - fill_start.elapsed() + fill_start.elapsed(), + needs_unattributed_fill )); let update_start = Instant::now(); @@ -1118,6 +1200,15 @@ fn make_entry_for_file( file_path, line_attr_start.elapsed() )); + debug_log(&format!( + "[BENCHMARK] attribution counts for {}: prev_chars={}, new_chars={}, line_ranges={}, bytes={}, lines={}", + file_path, + previous_attributions.len(), + new_attributions.len(), + line_attributions.len(), + content_len, + line_count + )); // Compute line stats while we already have both contents in memory let stats_start = Instant::now(); @@ -1138,6 +1229,81 @@ fn make_entry_for_file( Ok((entry, line_stats)) } +fn should_use_line_only(content_len: usize, line_count: u32, prev_attr_count: usize) -> bool { + let threshold_bytes = env_usize("GIT_AI_ATTR_LINE_ONLY_THRESHOLD_BYTES", 1_000_000); + let threshold_lines = env_usize("GIT_AI_ATTR_LINE_ONLY_THRESHOLD_LINES", 20_000) as u32; + let guardrail_work = env_u64("GIT_AI_ATTR_GUARDRAIL_WORK", 200_000_000); + + if threshold_bytes > 0 && content_len >= threshold_bytes { + return true; + } + if threshold_lines > 0 && line_count >= threshold_lines { + return true; + } + let estimated_work = (content_len as u64).saturating_mul(prev_attr_count as u64); + if guardrail_work > 0 && estimated_work >= guardrail_work { + debug_log(&format!( + "[Warning] Attribution guardrail triggered: content_len={} prev_attr_count={} work={}", + content_len, prev_attr_count, estimated_work + )); + return true; + } + false +} + +fn env_usize(key: &str, default_value: usize) -> usize { + std::env::var(key) + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(default_value) +} + +fn env_u64(key: &str, default_value: u64) -> u64 { + std::env::var(key) + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(default_value) +} + +fn line_only_update_line_attributions( + previous_line_attributions: &[LineAttribution], + previous_content: &str, + current_content: &str, + current_author: &str, +) -> Vec { + let prev_line_count = previous_content.lines().count() as u32; + let prev_authors = build_line_authorship_from_ranges( + previous_line_attributions, + prev_line_count, + &CheckpointKind::Human.to_str(), + ); + + let mut new_authors: Vec)>> = Vec::new(); + let mut old_idx: usize = 0; + + for change in compute_line_changes(previous_content, current_content) { + match change.tag() { + LineChangeTag::Equal => { + if let Some(author) = prev_authors.get(old_idx) { + new_authors.push(Some(author.clone())); + } else { + new_authors.push(Some((CheckpointKind::Human.to_str(), None))); + } + old_idx = old_idx.saturating_add(1); + } + LineChangeTag::Delete => { + old_idx = old_idx.saturating_add(1); + } + LineChangeTag::Insert => { + new_authors.push(Some((current_author.to_string(), None))); + } + } + } + + let merged = merge_consecutive_line_attributions(new_authors); + filter_human_line_attributions(merged) +} + /// Compute line statistics for a single file by diffing previous and current content fn compute_file_line_stats(previous_content: &str, current_content: &str) -> FileLineStats { let mut stats = FileLineStats::default(); @@ -1615,6 +1781,25 @@ mod tests { "Whitespace deletions ignored" ); } + + #[test] + fn test_line_only_update_line_attributions_basic() { + let previous = "alpha\nbeta\n"; + let current = "new\nalpha\nbeta\n"; + + let prev_line_attrs = vec![LineAttribution::new(1, 1, "mock_ai".to_string(), None)]; + let updated = + line_only_update_line_attributions(&prev_line_attrs, previous, current, "new_ai"); + + // Line 1 is new_ai, line 2 preserves mock_ai. Human-only line is omitted. + assert_eq!(updated.len(), 2); + assert_eq!(updated[0].start_line, 1); + assert_eq!(updated[0].end_line, 1); + assert_eq!(updated[0].author_id, "new_ai"); + assert_eq!(updated[1].start_line, 2); + assert_eq!(updated[1].end_line, 2); + assert_eq!(updated[1].author_id, "mock_ai"); + } } fn is_text_file(working_log: &PersistedWorkingLog, path: &str) -> bool {