
Implement Advanced Metrics and Evaluation System #19


Overview

Implement a comprehensive metrics system including standard NLP metrics (F1, BLEU, ROUGE), LLM-as-Judge evaluation, and custom metric composition.

Description

A robust evaluation system is critical for optimizing and benchmarking DSPy programs. This includes traditional metrics, modern LLM-based evaluation, and tools for combining multiple metrics.

Key Features to Implement

  • Standard NLP metrics (exact match, F1, BLEU, ROUGE)
  • LLM-as-Judge implementation
  • Custom metric builders
  • Metric composition and weighting
  • Batch evaluation for efficiency
  • Statistical significance testing

Implementation Requirements

1. Base Metric Interface

module Desiru
  module Metrics
    class Base
      attr_reader :name, :description
      
      def initialize(name:, description: nil)
        @name = name
        @description = description
      end
      
      # Single evaluation
      def evaluate(prediction, reference)
        raise NotImplementedError
      end
      
      # Batch evaluation
      def evaluate_batch(predictions, references)
        predictions.zip(references).map do |pred, ref|
          evaluate(pred, ref)
        end
      end
      
      # Aggregate scores
      def aggregate(scores)
        {
          mean: scores.sum.to_f / scores.size,
          std: standard_deviation(scores),
          min: scores.min,
          max: scores.max,
          count: scores.size
        }
      end
      
      private
      
      # Minimal implementations of the helpers referenced above; extract_text
      # accepts a plain string, a hash keyed by field name, or an object with
      # a matching accessor.
      def extract_text(object, field = :answer)
        case object
        when Hash
          (object[field] || object[field.to_s]).to_s
        else
          object.respond_to?(field) ? object.public_send(field).to_s : object.to_s
        end
      end
      
      def standard_deviation(scores)
        mean = scores.sum.to_f / scores.size
        Math.sqrt(scores.sum { |s| (s - mean)**2 } / scores.size)
      end
    end
  end
end
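
Any concrete metric only needs to subclass Base and implement evaluate; batch evaluation and aggregation are inherited. For illustration only, a hypothetical substring-containment metric (not part of the required feature set) might look like:

module Desiru::Metrics
  # Hypothetical example metric: full credit when the reference text
  # appears anywhere inside the prediction.
  class Contains < Base
    def initialize
      super(name: "contains")
    end
    
    def evaluate(prediction, reference)
      pred = extract_text(prediction).downcase
      ref = extract_text(reference).downcase
      pred.include?(ref) ? 1.0 : 0.0
    end
  end
end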

2. Standard Metrics

module Desiru::Metrics
  # Exact Match
  class ExactMatch < Base
    def initialize(ignore_case: false, ignore_punctuation: false, normalize_whitespace: true)
      super(name: "exact_match")
      @ignore_case = ignore_case
      @ignore_punctuation = ignore_punctuation
      @normalize_whitespace = normalize_whitespace
    end
    
    def evaluate(prediction, reference)
      pred_text = normalize(extract_text(prediction))
      ref_text = normalize(extract_text(reference))
      
      pred_text == ref_text ? 1.0 : 0.0
    end
    
    private
    
    def normalize(text)
      text = text.strip
      text = text.downcase if @ignore_case
      text = text.gsub(/\s+/, ' ') if @normalize_whitespace
      text = text.gsub(/[[:punct:]]/, '') if @ignore_punctuation
      text
    end
  end
  
  # F1 Score
  class F1Score < Base
    def initialize(field: :answer)
      super(name: "f1_score")
      @field = field
    end
    
    def evaluate(prediction, reference)
      pred_tokens = tokenize(extract_text(prediction, @field))
      ref_tokens = tokenize(extract_text(reference, @field))
      
      if pred_tokens.empty? && ref_tokens.empty?
        return 1.0
      elsif pred_tokens.empty? || ref_tokens.empty?
        return 0.0
      end
      
      # Multiset overlap: credit each token at most as often as it appears in both.
      ref_counts = ref_tokens.tally
      common = pred_tokens.tally.sum { |token, count| [count, ref_counts.fetch(token, 0)].min }
      precision = common.to_f / pred_tokens.size
      recall = common.to_f / ref_tokens.size
      
      if precision + recall == 0
        0.0
      else
        2 * (precision * recall) / (precision + recall)
      end
    end
    
    private
    
    def tokenize(text)
      text.downcase.split(/\s+/)
    end
  end
  
  # BLEU Score
  class BLEU < Base
    def initialize(n_gram: 4, smoothing: true)
      super(name: "bleu_#{n_gram}")
      @n_gram = n_gram
      @smoothing = smoothing
    end
    
    def evaluate(prediction, reference)
      pred_text = extract_text(prediction)
      ref_text = extract_text(reference)
      
      # Simplified BLEU implementation
      calculate_bleu(pred_text, ref_text)
    end
    
    private
    
    def calculate_bleu(prediction, reference)
      # Implementation of BLEU score calculation
      # Consider using existing gem like 'bleu'
    end
  end
end
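
calculate_bleu is intentionally left open above. If no existing gem is used, the private section of BLEU could be filled in along these lines: a minimal sentence-level sketch assuming whitespace tokenization and add-one smoothing (a production implementation should follow the standard corpus-level BLEU definition):

    # Minimal sketch: sentence-level BLEU with clipped n-gram precision,
    # optional add-one smoothing, and a brevity penalty.
    def calculate_bleu(prediction, reference)
      pred_tokens = prediction.downcase.split(/\s+/)
      ref_tokens = reference.downcase.split(/\s+/)
      return 0.0 if pred_tokens.empty?
      
      precisions = (1..@n_gram).map do |n|
        pred_ngrams = ngrams(pred_tokens, n)
        next 0.0 if pred_ngrams.empty?
        
        ref_counts = ngrams(ref_tokens, n).tally
        # Each predicted n-gram is credited at most as often as it occurs in the reference.
        clipped = pred_ngrams.tally.sum { |gram, count| [count, ref_counts.fetch(gram, 0)].min }
        @smoothing ? (clipped + 1.0) / (pred_ngrams.size + 1.0) : clipped.to_f / pred_ngrams.size
      end
      return 0.0 if precisions.any?(&:zero?)
      
      # Brevity penalty discourages predictions much shorter than the reference.
      brevity = pred_tokens.size >= ref_tokens.size ? 1.0 : Math.exp(1.0 - ref_tokens.size.to_f / pred_tokens.size)
      brevity * Math.exp(precisions.sum { |p| Math.log(p) } / @n_gram)
    end
    
    def ngrams(tokens, n)
      return [] if tokens.size < n
      
      tokens.each_cons(n).to_a
    end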

3. LLM-as-Judge

module Desiru::Metrics
  class LLMJudge < Base
    DEFAULT_PROMPT = <<~PROMPT
      Evaluate the quality of the following answer to the given question.
      
      Question: {{question}}
      Reference Answer: {{reference}}
      Predicted Answer: {{prediction}}
      
      Rate the predicted answer on a scale of 1-5 where:
      1 = Completely incorrect or irrelevant
      2 = Partially correct but missing key information
      3 = Mostly correct with minor issues
      4 = Correct and complete
      5 = Excellent, potentially better than reference
      
      Provide your rating as a single number.
    PROMPT
    
    def initialize(name: "llm_judge", model: nil, prompt: DEFAULT_PROMPT, extract_score: nil)
      super(name: name)
      @model = model || Desiru.configuration.default_model
      @prompt = prompt
      @extract_score = extract_score || method(:default_score_extractor)
      @llm = setup_llm(@model)
    end
    
    def evaluate(prediction, reference)
      prompt = build_prompt(prediction, reference)
      
      response = @llm.generate(
        prompt: prompt,
        temperature: 0.1,  # Low temperature for consistency
        max_tokens: 50
      )
      
      @extract_score.call(response.text)
    end
    
    # Batch evaluation with caching
    def evaluate_batch(predictions, references)
      # Use batch API if available
      if @llm.supports_batch?
        prompts = predictions.zip(references).map { |p, r| build_prompt(p, r) }
        responses = @llm.generate_batch(prompts: prompts)
        responses.map { |r| @extract_score.call(r.text) }
      else
        super
      end
    end
    
    private
    
    def build_prompt(prediction, reference)
      @prompt
        .gsub("{{question}}", extract_text(prediction, :question))
        .gsub("{{reference}}", extract_text(reference))
        .gsub("{{prediction}}", extract_text(prediction, :answer))
    end
    
    def default_score_extractor(text)
      # Extract number from response
      match = text.match(/\b([1-5])\b/)
      match ? match[1].to_f / 5.0 : 0.0
    end
  end
end

4. Composite Metrics

module Desiru::Metrics
  class Composite < Base
    def initialize(name:, metrics:, weights: nil, aggregation: :weighted_mean)
      super(name: name)
      @metrics = metrics
      @weights = weights || Array.new(metrics.size, 1.0 / metrics.size)
      @aggregation = aggregation
      
      validate_weights!
    end
    
    def evaluate(prediction, reference)
      scores = @metrics.map { |metric| metric.evaluate(prediction, reference) }
      
      case @aggregation
      when :weighted_mean
        scores.zip(@weights).sum { |score, weight| score * weight }
      when :min
        scores.min
      when :max
        scores.max
      when :geometric_mean
        (scores.reduce(:*) ** (1.0 / scores.size))
      else
        raise ArgumentError, "Unknown aggregation: #{@aggregation}"
      end
    end
    
    def detailed_evaluate(prediction, reference)
      results = {}
      
      @metrics.each do |metric|
        results[metric.name] = metric.evaluate(prediction, reference)
      end
      
      results[:composite] = evaluate(prediction, reference)
      results
    end
    
    private
    
    # Referenced in initialize; minimal check that weights line up with metrics.
    def validate_weights!
      return if @weights.size == @metrics.size
      
      raise ArgumentError, "expected #{@metrics.size} weights, got #{@weights.size}"
    end
  end
end
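
detailed_evaluate surfaces the per-metric scores alongside the weighted composite, which helps when tuning weights. An illustrative call (scores invented for the example; note that per-metric keys are the metric name strings while :composite is a symbol, as written above):

quality = Desiru::Metrics::Composite.new(
  name: "quality",
  metrics: [Desiru::Metrics::ExactMatch.new, Desiru::Metrics::F1Score.new],
  weights: [0.5, 0.5]
)

quality.detailed_evaluate(prediction, reference)
# => { "exact_match" => 0.0, "f1_score" => 0.67, :composite => 0.335 }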

5. Metric Builder DSL

module Desiru::Metrics
  class Builder
    def self.build(&block)
      builder = new
      builder.instance_eval(&block)
      builder.metric
    end
    
    def initialize
      @metrics = []
      @weights = []
    end
    
    def exact_match(weight: 1.0, **options)
      @metrics << ExactMatch.new(**options)
      @weights << weight
    end
    
    def f1(weight: 1.0, **options)
      @metrics << F1Score.new(**options)
      @weights << weight
    end
    
    def llm_judge(weight: 1.0, **options)
      @metrics << LLMJudge.new(**options)
      @weights << weight
    end
    
    def custom(weight: 1.0, &block)
      @metrics << Custom.new(&block)
      @weights << weight
    end
    
    def metric
      if @metrics.size == 1
        @metrics.first
      else
        Composite.new(
          name: "custom_composite",
          metrics: @metrics,
          weights: normalize_weights(@weights)
        )
      end
    end
    
    private
    
    def normalize_weights(weights)
      sum = weights.sum.to_f
      weights.map { |w| w / sum }
    end
  end
end
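
Builder#custom refers to a Custom metric class that is not defined elsewhere in this issue. A minimal sketch, assuming it simply wraps the supplied block:

module Desiru::Metrics
  # Wraps an arbitrary block as a metric so it can participate in
  # composition and the builder DSL. Exact shape is open to implementation.
  class Custom < Base
    def initialize(name: "custom", &block)
      super(name: name)
      raise ArgumentError, "a block is required" unless block
      
      @block = block
    end
    
    def evaluate(prediction, reference)
      @block.call(prediction, reference).to_f
    end
  end
end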

Example Usage

# Standard metrics
exact_match = Desiru::Metrics::ExactMatch.new(ignore_case: true)
f1 = Desiru::Metrics::F1Score.new(field: :answer)

score = exact_match.evaluate(prediction, reference)

# LLM-as-Judge
judge = Desiru::Metrics::LLMJudge.new(
  model: "gpt-4",
  prompt: <<~PROMPT,
    Is this answer factually correct and complete?
    Question: {{question}}
    Answer: {{prediction}}
    Reference: {{reference}}
    
    Reply with YES or NO.
  PROMPT
  extract_score: ->(text) { text.strip.upcase == "YES" ? 1.0 : 0.0 }
)

# Composite metric
composite = Desiru::Metrics::Composite.new(
  name: "quality_score",
  metrics: [exact_match, f1, judge],
  weights: [0.2, 0.3, 0.5]
)

# Using the builder DSL
metric = Desiru::Metrics::Builder.build do
  exact_match weight: 0.3, ignore_case: true
  f1 weight: 0.3
  llm_judge weight: 0.4, model: "gpt-4"
end

# Batch evaluation
results = metric.evaluate_batch(predictions, references)
summary = metric.aggregate(results)
puts "Mean score: #{summary[:mean]}"

# With optimizer
optimizer = BootstrapFewShot.new(
  program: my_program,
  metric: metric  # Use composite metric
)

Statistical Analysis

module Desiru::Metrics
  module Statistics
    def self.significance_test(scores1, scores2, alpha: 0.05)
      # Implement paired t-test or bootstrap test
    end
    
    def self.confidence_interval(scores, confidence: 0.95)
      # Calculate confidence interval
    end
  end
end
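
These stubs can be filled without extra dependencies by using a paired sign-flip bootstrap for significance and a normal-approximation interval for the mean. A sketch under those assumptions (not a prescribed implementation; a stats gem or a proper paired t-test would also satisfy the requirement):

module Desiru::Metrics
  module Statistics
    # Paired sign-flip bootstrap: under the null hypothesis the per-example
    # score differences are symmetric around zero, so random sign flips
    # approximate the null distribution of the mean difference.
    def self.significance_test(scores1, scores2, alpha: 0.05, iterations: 10_000)
      diffs = scores1.zip(scores2).map { |a, b| a - b }
      observed = diffs.sum.to_f / diffs.size
      
      extreme = iterations.times.count do
        resampled = diffs.map { |d| rand < 0.5 ? d : -d }
        (resampled.sum.to_f / resampled.size).abs >= observed.abs
      end
      
      p_value = extreme.to_f / iterations
      { significant: p_value < alpha, p_value: p_value, mean_difference: observed }
    end
    
    # Normal-approximation confidence interval for the mean score.
    def self.confidence_interval(scores, confidence: 0.95)
      mean = scores.sum.to_f / scores.size
      variance = scores.sum { |s| (s - mean)**2 } / (scores.size - 1)
      z = { 0.90 => 1.645, 0.95 => 1.96, 0.99 => 2.576 }.fetch(confidence, 1.96)
      margin = z * Math.sqrt(variance / scores.size)
      (mean - margin)..(mean + margin)
    end
  end
end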

Testing Requirements

  • Unit tests for each metric (see the RSpec sketch after this list)
  • Test metric composition
  • Compare with reference implementations
  • Test batch evaluation performance
  • Test LLM judge consistency
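
A minimal RSpec sketch for the unit-test requirement (illustrative expectations; adapt to the project's spec layout and the hash-based prediction format assumed here):

require "spec_helper"

RSpec.describe Desiru::Metrics::ExactMatch do
  subject(:metric) { described_class.new(ignore_case: true) }
  
  it "scores matching answers as 1.0 regardless of case" do
    expect(metric.evaluate({ answer: "Paris" }, { answer: "paris" })).to eq(1.0)
  end
  
  it "scores non-matching answers as 0.0" do
    expect(metric.evaluate({ answer: "Paris" }, { answer: "London" })).to eq(0.0)
  end
  
  it "aggregates batch scores" do
    scores = metric.evaluate_batch(
      [{ answer: "a" }, { answer: "b" }],
      [{ answer: "a" }, { answer: "c" }]
    )
    expect(metric.aggregate(scores)[:mean]).to eq(0.5)
  end
end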

Priority

Medium - important for thorough evaluation, but a basic exact-match metric already exists
