-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
User Story
As a job execution system, I want to efficiently clone repositories with shallow depth and comprehensive security validation, so that jobs can start quickly while preventing malicious repository access.
Acceptance Criteria
- Support shallow cloning with configurable depth (default: 1)
- Clone only the specific branch needed for the job
- Validate repository size before cloning (configurable limit, default: 1GB)
- Scan cloned repository for security issues (secrets, malware)
- Support sparse checkout for large repositories
- Implement clone retry logic with exponential backoff
- Cache frequently used repositories for performance
- Clean up cloned repositories after job completion
- Support Git LFS for large file handling
- Track clone metrics (duration, size, success rate)
Technical Implementation
Repository Cloning Service
// pkg/git/clone_service.go
package git
import (
	"context"
	"fmt"
	"time"

	// NOTE(review): CloneRepository also uses a `retry` package (the calls match
	// github.com/avast/retry-go); its import is not shown in this snippet — add it
	// with the module version the project pins.
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing"
)
// CloneService bundles the collaborators used to clone a repository for a job.
type CloneService struct {
	// client is the Git client; none of the methods shown here reference it.
	client *GitClient

	// scanner checks a fresh clone when config.ScanEnabled is set.
	scanner *SecurityScanner

	// cache is consulted before cloning and filled afterwards when
	// config.CacheEnabled is set.
	cache *RepositoryCache

	// metrics records cache hits and clone outcomes, and is also handed to
	// go-git as the clone progress sink.
	metrics *MetricsCollector

	// config holds the tunables described on CloneConfig.
	config *CloneConfig
}
// CloneConfig holds the tunables for CloneService.
type CloneConfig struct {
	// MaxRepoSize is the largest accepted repository size, in bytes.
	MaxRepoSize int64

	// DefaultDepth is the clone depth intended for requests that do not set
	// one (the acceptance criteria call for a default of 1).
	DefaultDepth int

	// CacheEnabled turns the repository cache lookup and store on.
	CacheEnabled bool

	// CacheTTL is the lifetime given to entries stored in the cache.
	CacheTTL time.Duration

	// ScanEnabled turns the post-clone security scan on.
	ScanEnabled bool

	// RetryAttempts is the number of clone attempts handed to the retry helper.
	RetryAttempts int

	// RetryBackoff is the base delay between clone attempts (back-off delay type).
	RetryBackoff time.Duration

	// CloneTimeout is the deadline applied to each individual clone attempt.
	CloneTimeout time.Duration
}
// CloneRequest describes one repository clone on behalf of a job.
type CloneRequest struct {
	// URL is the remote repository URL; together with Branch it keys the cache.
	URL string

	// Branch is the single branch that gets cloned.
	Branch string

	// Depth is the clone depth passed to go-git.
	Depth int

	// SparsePatterns, when non-empty, requests a sparse checkout of these
	// directories.
	SparsePatterns []string

	// Credentials is converted with ToAuth() into the go-git auth method.
	Credentials Credentials

	// JobID selects the clone directory via getClonePath.
	JobID string
}
// CloneRepository clones the repository described by req and returns a handle
// to the resulting working copy.
//
// Steps, in order:
//  1. If caching is enabled and the cache holds (URL, Branch), return that copy.
//  2. Validate the repository with validateRepository.
//  3. Clone with retries; every attempt gets its own CloneTimeout deadline.
//  4. If scanning is enabled, scan the clone; a failed scan removes the clone
//     directory and is returned as an error.
//  5. If caching is enabled, store the result (best effort).
//
// A clone that still fails after all attempts is recorded with
// RecordCloneFailure; a successful one with RecordCloneSuccess and the total
// elapsed time.
func (s *CloneService) CloneRepository(ctx context.Context, req CloneRequest) (*Repository, error) {
	// Captured first so the success metric covers validation, cloning and scanning.
	startTime := time.Now()

	if s.config.CacheEnabled {
		// Any cache error (miss, expiry, storage failure) falls through to a
		// fresh clone.
		if cached, err := s.cache.Get(req.URL, req.Branch); err == nil {
			s.metrics.RecordCacheHit(req.URL)
			return cached, nil
		}
	}

	repoInfo, err := s.validateRepository(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("repository validation failed: %w", err)
	}

	// Never hand the retry helper a zero or negative count: after the uint
	// conversion that means "retry forever" (0) or an astronomically large
	// number of attempts (negative).
	attempts := s.config.RetryAttempts
	if attempts < 1 {
		attempts = 1
	}

	var repo *git.Repository
	err = retry.Do(
		func() error {
			// Once the caller's context is done, further attempts cannot
			// succeed; stop instead of sleeping through the remaining back-off.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return retry.Unrecoverable(ctxErr)
			}

			cloneCtx, cancel := context.WithTimeout(ctx, s.config.CloneTimeout)
			defer cancel()

			cloned, cloneErr := s.performClone(cloneCtx, req, repoInfo)
			if cloneErr != nil {
				return cloneErr
			}
			repo = cloned
			return nil
		},
		retry.Attempts(uint(attempts)),
		retry.Delay(s.config.RetryBackoff),
		retry.DelayType(retry.BackOffDelay),
	)
	if err != nil {
		s.metrics.RecordCloneFailure(req.URL, err)
		return nil, err
	}

	// *git.Repository does not expose its on-disk location, so derive it the
	// same way performClone does.
	// NOTE(review): assumes getClonePath is deterministic for a given JobID — confirm.
	clonePath := s.getClonePath(req.JobID)

	if s.config.ScanEnabled {
		if err := s.scanner.ScanRepository(clonePath); err != nil {
			s.cleanup(clonePath)
			return nil, fmt.Errorf("security scan failed: %w", err)
		}
	}

	result := &Repository{repo: repo, path: clonePath}

	if s.config.CacheEnabled {
		// Best effort: the clone itself succeeded, so a cache write failure
		// must not fail the job. The error is intentionally discarded.
		_ = s.cache.Put(req.URL, req.Branch, result, s.config.CacheTTL)
	}

	s.metrics.RecordCloneSuccess(req.URL, time.Since(startTime))
	return result, nil
}
func (s *CloneService) performClone(ctx context.Context, req CloneRequest, info *RepoInfo) (*git.Repository, error) {
opts := &git.CloneOptions{
URL: req.URL,
Auth: req.Credentials.ToAuth(),
RemoteName: "origin",
ReferenceName: plumbing.NewBranchReferenceName(req.Branch),
SingleBranch: true,
Depth: req.Depth,
Progress: s.metrics,
Tags: git.NoTags,
}
// Configure sparse checkout if patterns provided
if len(req.SparsePatterns) > 0 {
opts.SparseCheckoutDirectories = req.SparsePatterns
}
return git.PlainCloneContext(ctx, s.getClonePath(req.JobID), false, opts)
}Security Scanner Integration
// pkg/git/scanner.go
// SecurityScanner runs the post-clone checks on a repository directory:
// size limit, secret scan and malware scan.
type SecurityScanner struct {
	// secretScanner looks for leaked secrets in the cloned tree.
	secretScanner *secrets.Scanner

	// malwareScanner looks for known threats in the cloned tree.
	malwareScanner *malware.Scanner

	// sizeChecker measures the on-disk size of the cloned tree.
	sizeChecker *SizeChecker

	// config supplies MaxRepoSize, which ScanRepository reads as
	// s.config.MaxRepoSize. The field was missing from the original struct.
	config *CloneConfig
}
func (s *SecurityScanner) ScanRepository(path string) error {
// Check repository size
size, err := s.sizeChecker.GetDirectorySize(path)
if err != nil {
return fmt.Errorf("size check failed: %w", err)
}
if size > s.config.MaxRepoSize {
return fmt.Errorf("repository too large: %d bytes (max: %d)", size, s.config.MaxRepoSize)
}
// Scan for secrets
secrets, err := s.secretScanner.ScanDirectory(path)
if err != nil {
return fmt.Errorf("secret scan failed: %w", err)
}
if len(secrets) > 0 {
return fmt.Errorf("found %d potential secrets", len(secrets))
}
// Malware scan
if threats := s.malwareScanner.Scan(path); len(threats) > 0 {
return fmt.Errorf("malware detected: %v", threats)
}
return nil
}Repository Caching
// pkg/git/cache.go
// RepositoryCache stores compressed repository snapshots keyed by URL and
// branch.
type RepositoryCache struct {
	// storage holds the compressed archives.
	storage ObjectStorage

	// index maps a cache key to its CacheEntry (storage path, timestamps, size).
	index *CacheIndex

	// compressor is the compression backend; the methods shown here go through
	// c.compress / c.decompress rather than touching it directly.
	compressor *Compressor

	// mutex is taken by Get (read side) and Put (write side).
	mutex sync.RWMutex
}
// Get returns the cached repository for (url, branch).
//
// It returns ErrCacheMiss when no entry exists and ErrCacheExpired when the
// entry's ExpiresAt has passed (the stale entry is then removed from the
// index). Errors from the object storage or from decompression are returned
// unchanged.
func (c *RepositoryCache) Get(url, branch string) (*Repository, error) {
	key := c.generateKey(url, branch)

	c.mutex.RLock()
	entry, exists := c.index.Get(key)
	if exists && !time.Now().After(entry.ExpiresAt) {
		// Live entry: keep the read lock while restoring so a concurrent Put
		// cannot replace the stored object mid-read.
		defer c.mutex.RUnlock()

		data, err := c.storage.Get(entry.StoragePath)
		if err != nil {
			return nil, err
		}
		return c.decompress(data)
	}
	c.mutex.RUnlock()

	if !exists {
		return nil, ErrCacheMiss
	}

	// Expired entry: removing it mutates the index, so the write lock is
	// required — deleting under the read lock would race with other readers.
	c.mutex.Lock()
	defer c.mutex.Unlock()

	// Re-check under the write lock: a Put may have refreshed the entry
	// between releasing the read lock and acquiring this one.
	if current, ok := c.index.Get(key); ok && time.Now().After(current.ExpiresAt) {
		c.index.Delete(key)
	}
	return nil, ErrCacheExpired
}
func (c *RepositoryCache) Put(url, branch string, repo *Repository, ttl time.Duration) error {
c.mutex.Lock()
defer c.mutex.Unlock()
key := c.generateKey(url, branch)
// Compress repository
compressed, err := c.compress(repo)
if err != nil {
return err
}
// Store in object storage
storagePath := fmt.Sprintf("cache/%s.tar.gz", key)
if err := c.storage.Put(storagePath, compressed); err != nil {
return err
}
// Update index
c.index.Set(key, CacheEntry{
StoragePath: storagePath,
CreatedAt: time.Now(),
ExpiresAt: time.Now().Add(ttl),
Size: len(compressed),
})
return nil
}Architecture References
Git Service Cloning Implementation
Reference: /docs/02-system-components.md:238-250
async def clone_repository(self, url: str, branch: str, depth: int = 1) -> Repository:
"""Clone repository with shallow depth and security checks"""
# Validate repository URL
if not self.security.validate_git_url(url):
raise SecurityException("Invalid or unauthorized repository URL")
# Clone with specific depth and single branch
repo = await self._secure_clone(url, branch, depth)
# Scan for security issues
await self.security.scan_repository(repo)
return repoContainer Security for Cloning
Reference: /docs/02-system-components.md:871-883
The worker containers have restricted permissions:
securityContext:
  readOnlyRootFilesystem: true
  allowPrivilegeEscalation: false
  capabilities:
    drop:
      - ALL
Performance Optimization
Reference: /docs/01-architecture-overview.md:180-185
- Container image caching
- Job result caching
- Lazy loading and pagination
- Efficient database queries with indexes
Dependencies
- go-git/v5: Git operations
- gitleaks: Secret scanning
- ClamAV: Malware scanning (optional)
- MinIO/S3: Repository cache storage
- Redis: Cache index
Definition of Done
- Unit tests cover all cloning scenarios with 85%+ coverage
- Integration tests verify cloning from GitHub/GitLab/Bitbucket
- Performance tests show <10s for typical repository clones
- Security scans detect test secrets and malware samples
- Cache hit rate >60% for frequently used repositories
- Documentation includes sparse checkout examples
- Metrics dashboard shows clone performance data
Effort Estimate
13 Story Points - Complex implementation with security scanning and caching
Labels
- backend
- git
- security
- performance
- epic-5