Skip to content

[User Story] Build Repository Cloning Service with Security Validation #44

@prodigy

Description

@prodigy

User Story

As a job execution system, I want to efficiently clone repositories with shallow depth and comprehensive security validation, so that jobs can start quickly while preventing malicious repository access.

Acceptance Criteria

  • Support shallow cloning with configurable depth (default: 1)
  • Clone only the specific branch needed for the job
  • Validate repository size before cloning (configurable limit, default: 1GB)
  • Scan cloned repository for security issues (secrets, malware)
  • Support sparse checkout for large repositories
  • Implement clone retry logic with exponential backoff
  • Cache frequently used repositories for performance
  • Clean up cloned repositories after job completion
  • Support Git LFS for large file handling
  • Track clone metrics (duration, size, success rate)

Technical Implementation

Repository Cloning Service

// pkg/git/clone_service.go
package git

import (
    "context"
    "fmt"
    "time"

    "github.com/go-git/go-git/v5"
    "github.com/go-git/go-git/v5/plumbing"
)

// CloneService bundles the collaborators used to clone a repository for a job:
// a cache that is consulted before cloning, a scanner that is run on the fresh
// clone, and a metrics collector that records hits, failures and successes.
type CloneService struct {
    // client is not referenced by the methods shown alongside this type.
    client *GitClient
    // scanner inspects a freshly cloned working copy when config.ScanEnabled is set.
    scanner *SecurityScanner
    // cache is read before cloning and written after a successful clone when
    // config.CacheEnabled is set.
    cache *RepositoryCache
    // metrics records cache hits and clone outcomes, and is also handed to
    // go-git as the clone progress sink.
    metrics *MetricsCollector
    // config holds the tunables read by the clone workflow.
    config *CloneConfig
}

// CloneConfig holds the tunables for the repository cloning workflow.
type CloneConfig struct {
    // MaxRepoSize is the repository size limit, in bytes.
    MaxRepoSize int64
    // DefaultDepth is the configured default shallow-clone depth.
    DefaultDepth int

    // CacheEnabled switches the repository cache lookup and store on or off.
    CacheEnabled bool
    // CacheTTL is the lifetime given to entries written to the cache.
    CacheTTL time.Duration

    // ScanEnabled switches the post-clone security scan on or off.
    ScanEnabled bool

    // RetryAttempts is the number of attempts handed to the retry helper.
    RetryAttempts int
    // RetryBackoff is the base delay handed to the retry helper.
    RetryBackoff time.Duration
    // CloneTimeout bounds each individual clone attempt.
    CloneTimeout time.Duration
}

// CloneRequest describes a single clone to perform on behalf of a job.
type CloneRequest struct {
    // URL is the remote repository location; it is also part of the cache key.
    URL string
    // Branch is the only branch that is fetched; it is also part of the cache key.
    Branch string
    // Depth is the shallow-clone depth passed to go-git.
    Depth int
    // SparsePatterns, when non-empty, are used to configure sparse checkout.
    SparsePatterns []string
    // Credentials are converted to a go-git auth method via ToAuth.
    Credentials Credentials
    // JobID is used to derive the clone destination path.
    JobID string
}

// CloneRepository returns a working copy of req.URL at req.Branch for a job.
//
// The steps are, in order:
//  1. If caching is enabled, return the cached repository when the cache has one.
//     Any cache error (miss, expiry, storage failure) falls through to a fresh clone.
//  2. Validate the repository via validateRepository.
//  3. Clone with retries; every attempt is bounded by config.CloneTimeout.
//  4. If scanning is enabled, scan the clone directory and remove it when the scan fails.
//  5. If caching is enabled, store the result (best effort).
//
// It returns an error when validation, every clone attempt, or the security
// scan fails. Clone failures and successes are reported to the metrics collector.
func (s *CloneService) CloneRepository(ctx context.Context, req CloneRequest) (*Repository, error) {
    // Taken up front so the recorded duration covers validation, cloning and scanning.
    startTime := time.Now()

    // Check cache first
    if s.config.CacheEnabled {
        if cached, err := s.cache.Get(req.URL, req.Branch); err == nil {
            s.metrics.RecordCacheHit(req.URL)
            return cached, nil
        }
    }

    // Validate repository before cloning
    repoInfo, err := s.validateRepository(ctx, req)
    if err != nil {
        return nil, fmt.Errorf("repository validation failed: %w", err)
    }

    // *git.Repository does not expose its on-disk location, so derive the path
    // with the same helper performClone uses for the clone destination.
    clonePath := s.getClonePath(req.JobID)

    // Clone with retry logic
    var repo *git.Repository
    err = retry.Do(func() error {
        cloneCtx, cancel := context.WithTimeout(ctx, s.config.CloneTimeout)
        defer cancel()

        // Use a closure-local error so the attempt does not write to the
        // enclosing function's err while retry.Do is still running.
        var cloneErr error
        repo, cloneErr = s.performClone(cloneCtx, req, repoInfo)
        return cloneErr
    },
        retry.Attempts(uint(s.config.RetryAttempts)),
        retry.Delay(s.config.RetryBackoff),
        retry.DelayType(retry.BackOffDelay),
    )
    if err != nil {
        s.metrics.RecordCloneFailure(req.URL, err)
        return nil, err
    }

    // Security scan
    if s.config.ScanEnabled {
        if err := s.scanner.ScanRepository(clonePath); err != nil {
            // Do not leave a rejected working copy on disk.
            s.cleanup(clonePath)
            return nil, fmt.Errorf("security scan failed: %w", err)
        }
    }

    result := &Repository{repo: repo, path: clonePath}

    // Cache if enabled
    if s.config.CacheEnabled {
        // Best effort: a failure to populate the cache must not fail a clone
        // that already succeeded and passed the scan.
        _ = s.cache.Put(req.URL, req.Branch, result, s.config.CacheTTL)
    }

    s.metrics.RecordCloneSuccess(req.URL, time.Since(startTime))
    return result, nil
}

// performClone runs a single clone attempt of req.URL into the job's clone path.
//
// Only req.Branch is fetched (single branch, no tags) and progress output is
// sent to the metrics collector. When req.Depth is not positive the configured
// DefaultDepth is used instead, because go-git treats a depth of 0 as "fetch
// the full history". info is accepted for the caller's convenience and is not
// consulted here.
//
// It returns the opened repository, or the error reported by go-git.
func (s *CloneService) performClone(ctx context.Context, req CloneRequest, info *RepoInfo) (*git.Repository, error) {
    // Fall back to the configured default so an unset depth still yields a shallow clone.
    depth := req.Depth
    if depth <= 0 {
        depth = s.config.DefaultDepth
    }

    opts := &git.CloneOptions{
        URL:           req.URL,
        Auth:          req.Credentials.ToAuth(),
        RemoteName:    "origin",
        ReferenceName: plumbing.NewBranchReferenceName(req.Branch),
        SingleBranch:  true,
        Depth:         depth,
        Progress:      s.metrics,
        Tags:          git.NoTags,
    }

    // Configure sparse checkout if patterns provided
    // NOTE(review): confirm the go-git version in use exposes
    // SparseCheckoutDirectories on CloneOptions; upstream documents that field
    // on CheckoutOptions.
    if len(req.SparsePatterns) > 0 {
        opts.SparseCheckoutDirectories = req.SparsePatterns
    }

    return git.PlainCloneContext(ctx, s.getClonePath(req.JobID), false, opts)
}

Security Scanner Integration

// pkg/git/scanner.go
// SecurityScanner groups the checks run against a cloned repository directory:
// a size check, a secret scan and a malware scan.
type SecurityScanner struct {
    // secretScanner scans a directory for potential secrets.
    secretScanner *secrets.Scanner
    // malwareScanner scans a path and reports the threats it finds.
    malwareScanner *malware.Scanner
    // sizeChecker measures the size of a directory.
    sizeChecker *SizeChecker
    // config supplies MaxRepoSize, which ScanRepository reads through s.config.
    // NOTE(review): typed as *CloneConfig because that is the type declaring
    // MaxRepoSize — confirm this is the intended source of the limit.
    config *CloneConfig
}

// ScanRepository runs the size check, the secret scan and the malware scan
// against the directory at path, in that order, and stops at the first problem.
//
// It returns nil when every check passes. Otherwise the error describes the
// check that failed to run, or the finding that caused the rejection.
func (s *SecurityScanner) ScanRepository(path string) error {
    // Size gate: reject before spending time on content scans.
    dirSize, err := s.sizeChecker.GetDirectorySize(path)
    if err != nil {
        return fmt.Errorf("size check failed: %w", err)
    }
    if dirSize > s.config.MaxRepoSize {
        return fmt.Errorf("repository too large: %d bytes (max: %d)", dirSize, s.config.MaxRepoSize)
    }

    // Secret scan: any finding at all rejects the repository.
    findings, err := s.secretScanner.ScanDirectory(path)
    if err != nil {
        return fmt.Errorf("secret scan failed: %w", err)
    }
    if count := len(findings); count > 0 {
        return fmt.Errorf("found %d potential secrets", count)
    }

    // Malware scan: any reported threat rejects the repository.
    threats := s.malwareScanner.Scan(path)
    if len(threats) > 0 {
        return fmt.Errorf("malware detected: %v", threats)
    }

    return nil
}

Repository Caching

// pkg/git/cache.go
// RepositoryCache stores compressed repositories in object storage and keeps
// an index of where each one lives and when it expires.
type RepositoryCache struct {
    // storage holds the compressed repository archives.
    storage ObjectStorage
    // index maps a cache key to its entry (storage path, timestamps, size).
    index *CacheIndex
    // compressor is not referenced directly by the methods shown alongside this type.
    compressor *Compressor
    // mutex is taken by Get (read) and Put (write).
    mutex sync.RWMutex
}

// Get returns the cached repository for url and branch.
//
// It returns ErrCacheMiss when the index has no entry for the key, and
// ErrCacheExpired when the entry's ExpiresAt has passed; in the latter case the
// stale index entry is removed. Errors from object storage or decompression
// are returned unchanged.
func (c *RepositoryCache) Get(url, branch string) (*Repository, error) {
    key := c.generateKey(url, branch)

    // The lookup and restore only read shared state, so they run under the read lock.
    repo, err := func() (*Repository, error) {
        c.mutex.RLock()
        defer c.mutex.RUnlock()

        entry, exists := c.index.Get(key)
        if !exists {
            return nil, ErrCacheMiss
        }

        // Check if still valid
        if time.Now().After(entry.ExpiresAt) {
            return nil, ErrCacheExpired
        }

        // Restore from storage
        data, err := c.storage.Get(entry.StoragePath)
        if err != nil {
            return nil, err
        }

        return c.decompress(data)
    }()

    if err == ErrCacheExpired {
        // Removing the entry mutates the index, so it needs the write lock
        // rather than the read lock held during the lookup.
        c.mutex.Lock()
        // Re-check under the write lock: a concurrent Put may have replaced
        // the entry between the two critical sections.
        if current, ok := c.index.Get(key); ok && time.Now().After(current.ExpiresAt) {
            c.index.Delete(key)
        }
        c.mutex.Unlock()
    }

    return repo, err
}

// Put compresses repo, uploads the archive to object storage and records it in
// the index under the key derived from url and branch, expiring after ttl.
//
// The write lock is held for the whole operation. It returns the compression
// or storage error if either step fails; the index is only updated after the
// upload succeeded.
func (c *RepositoryCache) Put(url, branch string, repo *Repository, ttl time.Duration) error {
    c.mutex.Lock()
    defer c.mutex.Unlock()

    key := c.generateKey(url, branch)

    // Step 1: produce the archive bytes.
    archive, err := c.compress(repo)
    if err != nil {
        return err
    }

    // Step 2: upload under a path derived from the cache key.
    objectPath := fmt.Sprintf("cache/%s.tar.gz", key)
    putErr := c.storage.Put(objectPath, archive)
    if putErr != nil {
        return putErr
    }

    // Step 3: publish the entry so later lookups can find the archive.
    entry := CacheEntry{
        StoragePath: objectPath,
        CreatedAt:   time.Now(),
        ExpiresAt:   time.Now().Add(ttl),
        Size:        len(archive),
    }
    c.index.Set(key, entry)

    return nil
}

Architecture References

Git Service Cloning Implementation

Reference: /docs/02-system-components.md:238-250

async def clone_repository(self, url: str, branch: str, depth: int = 1) -> Repository:
    """Clone a repository at shallow depth and run security checks on it.

    Args:
        url: Repository URL; rejected unless ``self.security.validate_git_url``
            accepts it.
        branch: Branch passed to ``_secure_clone``.
        depth: Clone depth passed to ``_secure_clone`` (defaults to 1).

    Returns:
        The repository produced by ``_secure_clone``, returned after
        ``self.security.scan_repository`` has completed.

    Raises:
        SecurityException: If the URL fails validation.
    """
    # Validate repository URL
    if not self.security.validate_git_url(url):
        raise SecurityException("Invalid or unauthorized repository URL")
        
    # Clone with specific depth and single branch
    repo = await self._secure_clone(url, branch, depth)
    
    # Scan for security issues
    await self.security.scan_repository(repo)
    return repo

Container Security for Cloning

Reference: /docs/02-system-components.md:871-883

The worker containers have restricted permissions:

# Restricted security context for the worker containers.
securityContext:
  # Root filesystem is read-only.
  # NOTE(review): clone destinations presumably need a separately mounted
  # writable volume — confirm against the worker deployment.
  readOnlyRootFilesystem: true
  # Privilege escalation is disallowed.
  allowPrivilegeEscalation: false
  capabilities:
    # Every capability is dropped.
    drop:
    - ALL

Performance Optimization

Reference: /docs/01-architecture-overview.md:180-185

  • Container image caching
  • Job result caching
  • Lazy loading and pagination
  • Efficient database queries with indexes

Dependencies

  • go-git/v5: Git operations
  • gitleaks: Secret scanning
  • ClamAV: Malware scanning (optional)
  • MinIO/S3: Repository cache storage
  • Redis: Cache index

Definition of Done

  • Unit tests cover all cloning scenarios with 85%+ coverage
  • Integration tests verify cloning from GitHub/GitLab/Bitbucket
  • Performance tests show <10s for typical repository clones
  • Security scans detect test secrets and malware samples
  • Cache hit rate >60% for frequently used repositories
  • Documentation includes sparse checkout examples
  • Metrics dashboard shows clone performance data

Effort Estimate

13 Story Points - Complex implementation with security scanning and caching

Labels

  • backend
  • git
  • security
  • performance
  • epic-5

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions