@@ -232,6 +232,7 @@ llm.sync.anthropic.api-key=
#RAG
codecrow.rag.api.url=http://host.docker.internal:8001
codecrow.rag.api.enabled=true
codecrow.rag.api.secret=change-me-to-a-random-secret
# RAG API timeouts (in seconds)
codecrow.rag.api.timeout.connect=30
codecrow.rag.api.timeout.read=120
1 change: 1 addition & 0 deletions deployment/config/mcp-client/.env.sample
@@ -6,6 +6,7 @@ RAG_API_URL=http://host.docker.internal:8001
# Shared secret for authenticating requests between internal services.
# Must match the SERVICE_SECRET configured on rag-pipeline.
# Leave empty to disable auth (dev mode only).
# IMPORTANT: Avoid $ { } characters in the secret — they can cause dotenv parsing issues.
SERVICE_SECRET=change-me-to-a-random-secret

# === Concurrency ===
1 change: 1 addition & 0 deletions deployment/config/rag-pipeline/.env.sample
@@ -2,6 +2,7 @@
# Shared secret for authenticating incoming requests from mcp-client.
# Must match the SERVICE_SECRET configured on mcp-client.
# Leave empty to disable auth (dev mode only).
# IMPORTANT: Avoid $ { } characters in the secret — they can cause dotenv parsing issues.
SERVICE_SECRET=change-me-to-a-random-secret

# === Path Traversal Guard ===
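On the Java side, the new codecrow.rag.api.secret property presumably ends up attached to outgoing RAG API requests so rag-pipeline can verify the caller. A minimal sketch of how that could look with the OkHttp client already used in this codebase; the header name X-Service-Secret and the interceptor wiring are illustrative assumptions, not confirmed by this diff:

import java.io.IOException;
import okhttp3.Interceptor;
import okhttp3.Request;
import okhttp3.Response;

// Hypothetical interceptor: attaches the shared secret to every outgoing
// request. Header name and wiring are assumptions for illustration only.
final class ServiceSecretInterceptor implements Interceptor {
    private final String secret;

    ServiceSecretInterceptor(String secret) {
        this.secret = secret;
    }

    @Override
    public Response intercept(Chain chain) throws IOException {
        if (secret == null || secret.isBlank()) {
            // Empty secret disables auth (dev mode only, per the .env comments)
            return chain.proceed(chain.request());
        }
        Request authed = chain.request().newBuilder()
                .header("X-Service-Secret", secret)
                .build();
        return chain.proceed(authed);
    }
}

// Usage (sketch):
// OkHttpClient client = new OkHttpClient.Builder()
//         .addInterceptor(new ServiceSecretInterceptor(System.getenv("SERVICE_SECRET")))
//         .build();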
@@ -224,10 +224,10 @@ public Map<String, Object> process(BranchProcessRequest request, Consumer<Map<St
"message", "Analyzing " + changedFiles.size() + " changed files"
));

Set<String> existingFiles = updateBranchFiles(changedFiles, project, request.getTargetBranchName());
Branch branch = createOrUpdateProjectBranch(project, request);

mapCodeAnalysisIssuesToBranch(changedFiles, existingFiles, branch, project);

// Always update branch issue counts after mapping (even on first analysis)
// Previously this was only done in reanalyzeCandidateIssues() which could be skipped
@@ -282,10 +282,15 @@ public Set<String> parseFilePathsFromDiff(String rawDiff) {
return files;
}

/**
* Updates branch file records for changed files.
* @return the set of file paths confirmed to exist in the branch (used to avoid redundant API calls)
*/
private Set<String> updateBranchFiles(Set<String> changedFiles, Project project, String branchName) {
VcsInfo vcsInfo = getVcsInfo(project);
EVcsProvider provider = getVcsProvider(project);
VcsOperationsService operationsService = vcsServiceFactory.getOperationsService(provider);
Set<String> filesExistingInBranch = new HashSet<>();

for (String filePath : changedFiles) {
try {
@@ -303,9 +308,12 @@ private void updateBranchFiles(Set<String> changedFiles, Project project, String
log.debug("Skipping file {} - does not exist in branch {}", filePath, branchName);
continue;
}
filesExistingInBranch.add(filePath);
} catch (Exception e) {
log.warn("Failed to check file existence for {} in branch {}: {}. Proceeding anyway.",
filePath, branchName, e.getMessage());
// On error, assume the file exists so we don't skip it
filesExistingInBranch.add(filePath);
}

List<CodeAnalysisIssue> relatedIssues = codeAnalysisIssueRepository
@@ -314,7 +322,14 @@ private void updateBranchFiles(Set<String> changedFiles, Project project, String
.filter(issue -> branchName.equals(issue.getAnalysis().getBranchName()) ||
branchName.equals(issue.getAnalysis().getSourceBranchName()))
.toList();
// Deduplicate by content key before counting — multiple analyses may
// report the same logical issue with different DB ids
Set<String> seenKeys = new HashSet<>();
long unresolvedCount = branchSpecific.stream()
.filter(i -> !i.isResolved())
.filter(i -> seenKeys.add(buildIssueContentKey(i)))
.count();

Optional<BranchFile> projectFileOptional = branchFileRepository
.findByProjectIdAndBranchNameAndFilePath(project.getId(), branchName, filePath);
@@ -333,6 +348,7 @@ private void updateBranchFiles(Set<String> changedFiles, Project project, String
branchFileRepository.save(branchFile);
}
}
return filesExistingInBranch;
}

private Branch createOrUpdateProjectBranch(Project project, BranchProcessRequest request) {
@@ -348,31 +364,14 @@ private Branch createOrUpdateProjectBranch(Project project, BranchProcessRequest
return branchRepository.save(branch);
}

private void mapCodeAnalysisIssuesToBranch(Set<String> changedFiles, Set<String> filesExistingInBranch,
Branch branch, Project project) {
for (String filePath : changedFiles) {
// Use cached file existence from updateBranchFiles to avoid redundant API calls
if (!filesExistingInBranch.contains(filePath)) {
log.debug("Skipping issue mapping for file {} - does not exist in branch {} (cached)",
filePath, branch.getBranchName());
continue;
}

List<CodeAnalysisIssue> allIssues = codeAnalysisIssueRepository.findByProjectIdAndFilePath(project.getId(), filePath);
@@ -387,27 +386,67 @@ private void mapCodeAnalysisIssuesToBranch(Set<String> changedFiles, Branch bran
})
.toList();

// Content-based deduplication: build a map of existing BranchIssues by content key
// to prevent the same logical issue from being linked multiple times across analyses.
// Key = "filePath:lineNumber:severity:category" (see buildIssueContentKey below).
List<BranchIssue> existingBranchIssues = branchIssueRepository
.findUnresolvedByBranchIdAndFilePath(branch.getId(), filePath);
Map<String, BranchIssue> contentKeyMap = new HashMap<>();
for (BranchIssue bi : existingBranchIssues) {
String key = buildIssueContentKey(bi.getCodeAnalysisIssue());
contentKeyMap.putIfAbsent(key, bi);
}

int skipped = 0;
for (CodeAnalysisIssue issue : branchSpecificIssues) {
// Tier 1: exact ID match — same CodeAnalysisIssue already linked
Optional<BranchIssue> existing = branchIssueRepository
.findByBranchIdAndCodeAnalysisIssueId(branch.getId(), issue.getId());

if (existing.isPresent()) {
BranchIssue bc = existing.get();
bc.setSeverity(issue.getSeverity());
bc.setFirstDetectedPrNumber(issue.getAnalysis() != null ? issue.getAnalysis().getPrNumber() : null);
branchIssueRepository.saveAndFlush(bc);
continue;
}

// Tier 2: content-based dedup — same logical issue from a different analysis
String contentKey = buildIssueContentKey(issue);
if (contentKeyMap.containsKey(contentKey)) {
skipped++;
continue;
}

// No match — create new BranchIssue
BranchIssue bc = new BranchIssue();
bc.setBranch(branch);
bc.setCodeAnalysisIssue(issue);
bc.setResolved(issue.isResolved());
bc.setSeverity(issue.getSeverity());
bc.setFirstDetectedPrNumber(issue.getAnalysis() != null ? issue.getAnalysis().getPrNumber() : null);
branchIssueRepository.saveAndFlush(bc);
// Register in map so subsequent issues in this batch also dedup
contentKeyMap.put(contentKey, bc);
}

if (skipped > 0) {
log.debug("Skipped {} duplicate issue(s) for file {} in branch {}",
skipped, filePath, branch.getBranchName());
}
}
}

/**
* Builds a content key for deduplication of branch issues.
* Two CodeAnalysisIssue records with the same key represent the same logical issue.
*/
private String buildIssueContentKey(CodeAnalysisIssue issue) {
return issue.getFilePath() + ":" +
issue.getLineNumber() + ":" +
issue.getSeverity() + ":" +
issue.getIssueCategory();
}
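For example (hypothetical values), two CodeAnalysisIssue rows from different analyses that flag the same finding both map to the key:

src/main/java/PaymentService.java:42:HIGH:SECURITY

One trade-off of this granularity: two genuinely distinct issues on the same line with the same severity and category would collide and be deduplicated as one.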

private void reanalyzeCandidateIssues(Set<String> changedFiles, Branch branch, Project project, BranchProcessRequest request, Consumer<Map<String, Object>> consumer) {
List<BranchIssue> candidateBranchIssues = new ArrayList<>();
for (String filePath : changedFiles) {
@@ -33,6 +33,8 @@
import java.util.Map;
import java.util.Optional;

import org.rostilos.codecrow.analysisengine.util.DiffFingerprintUtil;

/**
* Generic service that handles pull request analysis.
* Uses VCS-specific services via VcsServiceFactory for provider-specific operations.
@@ -151,6 +153,29 @@ public Map<String, Object> process(
return Map.of("status", "cached", "cached", true);
}

// --- Fallback cache: same commit hash, any PR number (handles close/reopen) ---
Optional<CodeAnalysis> commitHashHit = codeAnalysisService.getAnalysisByCommitHash(
project.getId(), request.getCommitHash());
if (commitHashHit.isPresent()) {
log.info("Commit-hash cache hit for project={}, commit={} (source PR={}). Cloning for PR={}.",
project.getId(), request.getCommitHash(),
commitHashHit.get().getPrNumber(), request.getPullRequestId());
CodeAnalysis cloned = codeAnalysisService.cloneAnalysisForPr(
commitHashHit.get(), project, request.getPullRequestId(),
request.getCommitHash(), request.getTargetBranchName(),
request.getSourceBranchName(), commitHashHit.get().getDiffFingerprint());
try {
reportingService.postAnalysisResults(cloned, project,
request.getPullRequestId(), pullRequest.getId(),
request.getPlaceholderCommentId());
} catch (IOException e) {
log.error("Failed to post commit-hash cached results to VCS: {}", e.getMessage(), e);
}
publishAnalysisCompletedEvent(project, request, correlationId, startTime,
AnalysisCompletedEvent.CompletionStatus.SUCCESS, 0, 0, null);
return Map.of("status", "cached_by_commit", "cached", true);
}

// Get all previous analyses for this PR to provide full issue history to AI
List<CodeAnalysis> allPrAnalyses = codeAnalysisService.getAllPrAnalyses(
project.getId(),
@@ -170,6 +195,34 @@ public Map<String, Object> process(
AiAnalysisRequest aiRequest = aiClientService.buildAiAnalysisRequest(
project, request, previousAnalysis, allPrAnalyses);

// --- Diff fingerprint cache: same code changes, different PR/commit ---
String diffFingerprint = DiffFingerprintUtil.compute(aiRequest.getRawDiff());
if (diffFingerprint != null) {
Optional<CodeAnalysis> fingerprintHit = codeAnalysisService.getAnalysisByDiffFingerprint(
project.getId(), diffFingerprint);
if (fingerprintHit.isPresent()) {
log.info("Diff fingerprint cache hit for project={}, fingerprint={} (source PR={}). Cloning for PR={}.",
project.getId(), diffFingerprint.substring(0, 8) + "...",
fingerprintHit.get().getPrNumber(), request.getPullRequestId());
// TODO: Option B — LIGHTWEIGHT mode: instead of full clone, reuse Stage 1 issues
// but re-run Stage 2 cross-file analysis against the new target branch context.
CodeAnalysis cloned = codeAnalysisService.cloneAnalysisForPr(
fingerprintHit.get(), project, request.getPullRequestId(),
request.getCommitHash(), request.getTargetBranchName(),
request.getSourceBranchName(), diffFingerprint);
try {
reportingService.postAnalysisResults(cloned, project,
request.getPullRequestId(), pullRequest.getId(),
request.getPlaceholderCommentId());
} catch (IOException e) {
log.error("Failed to post fingerprint-cached results to VCS: {}", e.getMessage(), e);
}
publishAnalysisCompletedEvent(project, request, correlationId, startTime,
AnalysisCompletedEvent.CompletionStatus.SUCCESS, 0, 0, null);
return Map.of("status", "cached_by_fingerprint", "cached", true);
}
}
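The codeAnalysisService lookups used by both cache tiers (getAnalysisByCommitHash, getAnalysisByDiffFingerprint) are not shown in this diff. A plausible shape for the backing queries, assuming Spring Data JPA and that CodeAnalysis persists commitHash, diffFingerprint, and createdAt fields; all names here are guesses based on the calls above:

import java.util.Optional;
import org.springframework.data.jpa.repository.JpaRepository;

// Hypothetical repository backing the two cache tiers above.
public interface CodeAnalysisRepository extends JpaRepository<CodeAnalysis, Long> {

    // Fallback tier: exact commit, any PR number (handles close/reopen)
    Optional<CodeAnalysis> findFirstByProjectIdAndCommitHashOrderByCreatedAtDesc(
            Long projectId, String commitHash);

    // Fingerprint tier: identical normalised change content, any commit/PR
    Optional<CodeAnalysis> findFirstByProjectIdAndDiffFingerprintOrderByCreatedAtDesc(
            Long projectId, String diffFingerprint);
}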

Map<String, Object> aiResponse = aiAnalysisClient.performAnalysis(aiRequest, event -> {
try {
log.debug("Received event from AI client: type={}", event.get("type"));
@@ -188,7 +241,8 @@ public Map<String, Object> process(
request.getSourceBranchName(),
request.getCommitHash(),
request.getPrAuthorId(),
request.getPrAuthorUsername(),
diffFingerprint
);

int issuesFound = newAnalysis.getIssues() != null ? newAnalysis.getIssues().size() : 0;
@@ -0,0 +1,102 @@
package org.rostilos.codecrow.analysisengine.util;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Computes a content-based fingerprint of a unified diff.
* <p>
* Only actual change lines ({@code +} / {@code -}) are included — context lines,
* hunk headers ({@code @@}), and file headers ({@code +++} / {@code ---} / {@code diff --git})
* are excluded. The change lines are sorted to make the fingerprint stable regardless
* of file ordering within the diff.
* <p>
* This allows detecting that two PRs carry the same code changes even if they target
* different branches (different merge-base → different context/hunk headers).
*/
public final class DiffFingerprintUtil {

private DiffFingerprintUtil() { /* utility */ }

/**
* Compute a SHA-256 hex digest of the normalised change lines in the given diff.
*
* @param rawDiff the filtered unified diff (may be {@code null} or empty)
* @return 64-char lowercase hex string, or {@code null} if the diff is blank
*/
public static String compute(String rawDiff) {
if (rawDiff == null || rawDiff.isBlank()) {
return null;
}

List<String> changeLines = extractChangeLines(rawDiff);
if (changeLines.isEmpty()) {
return null;
}

// Sort for stability across different file orderings
Collections.sort(changeLines);

try {
MessageDigest digest = MessageDigest.getInstance("SHA-256");
for (String line : changeLines) {
digest.update(line.getBytes(StandardCharsets.UTF_8));
digest.update((byte) '\n');
}
return bytesToHex(digest.digest());
} catch (NoSuchAlgorithmException e) {
// SHA-256 is guaranteed by the JVM spec — should never happen
throw new IllegalStateException("SHA-256 not available", e);
}
}

/**
* Extract only the actual change lines from a unified diff.
* A "change line" starts with exactly one {@code +} or {@code -} and is NOT
* a file header ({@code +++}, {@code ---}) or a diff metadata line.
*/
private static List<String> extractChangeLines(String diff) {
List<String> lines = new ArrayList<>();
// Normalise line endings
String normalised = diff.replace("\r\n", "\n").replace("\r", "\n");
for (String raw : normalised.split("\n")) {
String line = trimTrailingWhitespace(raw);
if (line.isEmpty()) {
continue;
}
char first = line.charAt(0);
if (first != '+' && first != '-') {
continue;
}
// Skip file-level headers: "+++", "---", "diff --git"
if (line.startsWith("+++") || line.startsWith("---")) {
continue;
}
if (line.startsWith("diff ")) {
continue;
}
lines.add(line);
}
return lines;
}

private static String trimTrailingWhitespace(String s) {
int end = s.length();
while (end > 0 && Character.isWhitespace(s.charAt(end - 1))) {
end--;
}
return s.substring(0, end);
}

private static String bytesToHex(byte[] bytes) {
StringBuilder sb = new StringBuilder(bytes.length * 2);
for (byte b : bytes) {
sb.append(String.format("%02x", b & 0xff));
}
return sb.toString();
}
}
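A quick illustration of the stability property described in the class Javadoc: two made-up diffs whose hunk headers and context lines differ (as they would after a different merge-base) but whose actual change lines are identical produce the same fingerprint.

import org.rostilos.codecrow.analysisengine.util.DiffFingerprintUtil;

// Demo with fabricated diffs: file headers, hunk headers and context differ,
// but the +/- change lines are the same, so the fingerprints match.
public class DiffFingerprintDemo {
    public static void main(String[] args) {
        String diffA =
                "diff --git a/Foo.java b/Foo.java\n" +
                "--- a/Foo.java\n" +
                "+++ b/Foo.java\n" +
                "@@ -10,3 +10,3 @@ class Foo {\n" +
                " int unchanged;\n" +
                "-int x = 1;\n" +
                "+int x = 2;\n";

        String diffB =
                "diff --git a/Foo.java b/Foo.java\n" +
                "--- a/Foo.java\n" +
                "+++ b/Foo.java\n" +
                "@@ -42,3 +42,3 @@ class Foo { // different merge-base context\n" +
                " String other;\n" +
                "-int x = 1;\n" +
                "+int x = 2;\n";

        String fpA = DiffFingerprintUtil.compute(diffA);
        String fpB = DiffFingerprintUtil.compute(diffB);
        System.out.println(fpA.equals(fpB)); // prints: true
    }
}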