From 542df3e0638639fcb49051e477427aa1d3571113 Mon Sep 17 00:00:00 2001 From: mars167 Date: Sun, 1 Feb 2026 17:03:47 +0800 Subject: [PATCH] feat(embedding): add MiniLM support and env var configuration - Add GIT_AI_EMBEDDING_MODEL environment variable override - Auto-detect embedding dimensions (MiniLM=384, CodeBERT=768) - Add docs/embedding.md with model configuration guide - Support lightweight MiniLM-L6 model for local development --- .git-ai/lancedb.tar.gz | 4 +- docs/embedding.md | 158 +++++++++++++++++++++++++++++++++ src/core/embedding/semantic.ts | 14 ++- 3 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 docs/embedding.md diff --git a/.git-ai/lancedb.tar.gz b/.git-ai/lancedb.tar.gz index 1f6b1b9..2932208 100644 --- a/.git-ai/lancedb.tar.gz +++ b/.git-ai/lancedb.tar.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61d6d25e063b610ec0a02893b5310abdab5b8214ce02f287c8a92abefe9543ef -size 251891 +oid sha256:91a96b77017b3dcdf69cbfd6fd59e87ae9a60e6aa5cb0c40c287b9aa694245b8 +size 255609 diff --git a/docs/embedding.md b/docs/embedding.md new file mode 100644 index 0000000..3de0eea --- /dev/null +++ b/docs/embedding.md @@ -0,0 +1,158 @@ +# Embedding Models + +git-ai uses ONNX-compatible embedding models for semantic code search. This document covers model configuration, available options, and setup instructions. + +## Overview + +The embedding system converts code snippets into vector representations for similarity search. git-ai supports: + +- **Semantic Embedding**: Neural network-based code representation (CodeBERT, MiniLM) +- **Structural Embedding**: AST-based structural features (WL kernel hashing) +- **Symbolic Embedding**: Identifier and symbol relationships + +## Configuration + +### Environment Variable + +Set `GIT_AI_EMBEDDING_MODEL` to override the default embedding model: + +```bash +export GIT_AI_EMBEDDING_MODEL="$HOME/.cache/git-ai/models/minilm/model.onnx" +``` + +Add to your shell profile for permanent use: + +```bash +# ~/.zshrc or ~/.bashrc +export GIT_AI_EMBEDDING_MODEL="$HOME/.cache/git-ai/models/minilm/model.onnx" +``` + +### Default Paths + +| Model | Default Path | +|-------|-------------| +| CodeBERT | `~/.cache/git-ai/models/codebert/model.onnx` | +| MiniLM | `~/.cache/git-ai/models/minilm/model.onnx` | + +The system automatically detects the model type and sets the appropriate embedding dimension: +- CodeBERT: 768 dimensions +- MiniLM-L6: 384 dimensions + +## Available Models + +### MiniLM-L6 (Recommended) + +Lightweight, fast model ideal for local development. + +- **Size**: ~86MB +- **Dimensions**: 384 +- **Speed**: Fast (<100ms per query) +- **Download**: + +```python +from huggingface_hub import hf_hub_download + +hf_hub_download( + repo_id="Xenova/all-MiniLM-L6-v2", + filename="onnx/model.onnx", + local_dir="$HOME/.cache/git-ai/models/minilm" +) +``` + +### CodeBERT + +Microsoft CodeBERT for code understanding. + +- **Size**: ~500MB +- **Dimensions**: 768 +- **Quality**: Higher semantic understanding +- **Download**: + +```bash +huggingface-cli download onnx-community/codebert-javascript-ONNX \ + --local-dir "$HOME/.cache/git-ai/models/codebert" +``` + +## Model Directory Structure + +``` +~/.cache/git-ai/models/ +├── codebert/ +│ ├── model.onnx # ONNX model file +│ └── config.json # Model configuration +└── minilm/ + ├── model.onnx -> onnx/model.onnx # Symlink to ONNX model + ├── onnx/ + │ └── model.onnx + └── config.json +``` + +## Fallback Behavior + +If no model is found, git-ai automatically falls back to hash-based embedding: + +- **Quality**: Good for exact matches +- **Speed**: <1ms +- **Memory**: <1MB +- **Dependencies**: None + +No crashes or service interruption when model is unavailable. + +## Performance Considerations + +| Model | Memory | CPU Inference | GPU Recommended | +|-------|--------|---------------|-----------------| +| MiniLM | ~200MB | Excellent | Optional | +| CodeBERT | ~800MB | Good | Yes | + +### Batch Processing + +Configure batch size in environment: + +```bash +export GIT_AI_EMBEDDING_BATCH_SIZE=8 +``` + +## Troubleshooting + +### Model Load Failed + +``` +{"level":"warn","msg":"semantic_embed_fallback","err":"..."} +``` + +Causes: +- Model file doesn't exist +- Corrupted model file +- Incompatible ONNX opset version + +Solution: +1. Verify model path is correct +2. Check model file is valid ONNX +3. Ensure onnxruntime-node is installed + +### Dimension Mismatch + +If you see dimension errors, verify the model path matches the expected dimension: +- MiniLM: 384 dimensions +- CodeBERT: 768 dimensions + +## Comparison + +| Aspect | MiniLM | CodeBERT | Hash Fallback | +|--------|--------|----------|---------------| +| Size | 86MB | 500MB | <1MB | +| Dimensions | 384 | 768 | N/A | +| Speed | <100ms | 100-500ms | <1ms | +| Quality | Good | Excellent | Exact matches | +| Memory | Low | High | Minimal | + +## Dependencies + +```json +{ + "onnxruntime-node": "^1.19.2" +} +``` + +Required for embedding functionality. Optional - the system works with hash fallback without it. diff --git a/src/core/embedding/semantic.ts b/src/core/embedding/semantic.ts index d142749..17bfebb 100644 --- a/src/core/embedding/semantic.ts +++ b/src/core/embedding/semantic.ts @@ -273,9 +273,19 @@ export class OnnxSemanticEmbedder implements SemanticEmbedder { } export function defaultSemanticConfig(): SemanticConfig { + // Support environment variable override + const modelPath = process.env.GIT_AI_EMBEDDING_MODEL || + path.join(os.homedir(), '.cache', 'git-ai', 'models', 'codebert', 'model.onnx'); + + // Auto-detect embedding dimension based on model path + let embeddingDim = 768; // Default for CodeBERT + if (modelPath.includes('MiniLM')) { + embeddingDim = 384; // MiniLM-L6 uses 384 dimensions + } + return { - modelName: path.join(os.homedir(), '.cache', 'git-ai', 'models', 'codebert', 'model.onnx'), - embeddingDim: 768, + modelName: modelPath, + embeddingDim, device: 'cpu', batchSize: 4, };