From 05bf9a74fe0c9f2436c60ab978ba14ad7f9bf71e Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:04:14 -0800
Subject: [PATCH 1/7] feat(tfidf): add TF-IDF vectorizer

Adds standalone TF-IDF vectorizer for text feature extraction.
Foundation for classic text classification approaches like logistic
regression and improved LSI quality.

Features:
- fit/transform/fit_transform API (scikit-learn style)
- Vocabulary filtering via min_df/max_df thresholds
- N-gram support (unigrams, bigrams, trigrams)
- Sublinear TF scaling (1 + log(tf))
- L2 normalized output vectors
- JSON and Marshal serialization

Leverages existing word_hash infrastructure for term frequency
extraction with stemming and stopword removal.

Closes #104
---
 README.md                |  74 ++++++-
 lib/classifier.rb        |   1 +
 lib/classifier/errors.rb |   3 +
 lib/classifier/tfidf.rb  | 258 +++++++++++++++++++++++
 test/tfidf/tfidf_test.rb | 430 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 765 insertions(+), 1 deletion(-)
 create mode 100644 lib/classifier/tfidf.rb
 create mode 100644 test/tfidf/tfidf_test.rb

diff --git a/README.md b/README.md
index b4e0eaa..df722e5 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![CI](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml/badge.svg)](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml)
 [![License: LGPL](https://img.shields.io/badge/License-LGPL_2.1-blue.svg)](https://opensource.org/licenses/LGPL-2.1)
 
-A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), and k-Nearest Neighbors (kNN) algorithms.
+A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), k-Nearest Neighbors (kNN), and TF-IDF algorithms.
 
 **[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)**
 
@@ -14,6 +14,7 @@ A Ruby library for text classification using Bayesian, LSI (Latent Semantic Inde
 - [Bayesian Classifier](#bayesian-classifier)
 - [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing)
 - [k-Nearest Neighbors (kNN)](#k-nearest-neighbors-knn)
+- [TF-IDF Vectorizer](#tf-idf-vectorizer)
 - [Persistence](#persistence)
 - [Performance](#performance)
 - [Development](#development)
@@ -256,6 +257,77 @@ knn.categories
 
 **Why the size difference?** Bayes stores aggregate statistics—adding 10,000 documents just increments counters. kNN stores every example and compares against all of them during classification, so performance degrades with size.
 
+## TF-IDF Vectorizer
+
+Transform text documents into TF-IDF (Term Frequency-Inverse Document Frequency) weighted feature vectors. TF-IDF downweights common words and upweights discriminative terms—the foundation for most classic text classification approaches.
+
+### Quick Start
+
+```ruby
+require 'classifier'
+
+tfidf = Classifier::TFIDF.new
+tfidf.fit(["Dogs are great pets", "Cats are independent", "Birds can fly"])
+
+# Transform text to TF-IDF vector (L2 normalized)
+vector = tfidf.transform("Dogs are loyal")
+# => {:dog=>1.0}  ("loyal" was not seen during fit, so it is ignored)
+
+# Fit and transform in one step
+vectors = tfidf.fit_transform(documents)
+```
+
+### Options
+
+```ruby
+tfidf = Classifier::TFIDF.new(
+  min_df: 2,           # Minimum document frequency (Integer or Float 0.0-1.0)
+  max_df: 0.95,        # Maximum document frequency (filters very common terms)
+  ngram_range: [1, 2], # Extract unigrams and bigrams
+  sublinear_tf: true   # Use 1 + log(tf) instead of raw term frequency
+)
+```
+
+### Vocabulary Inspection
+
+```ruby
+tfidf.fit(documents)
+
+tfidf.vocabulary      # => {:dog=>0, :cat=>1, :bird=>2, ...}
+tfidf.idf             # => {:dog=>1.693, :cat=>1.693, ...}
+tfidf.feature_names   # => [:dog, :cat, :bird, ...]
+tfidf.num_documents   # => 3
+tfidf.fitted?         # => true
+```
+
+### N-gram Support
+
+```ruby
+# Extract bigrams only
+tfidf = Classifier::TFIDF.new(ngram_range: [2, 2])
+tfidf.fit(["quick brown fox", "lazy brown dog"])
+tfidf.vocabulary.keys
+# => [:quick_brown, :brown_fox, :lazi_brown, :brown_dog]
+
+# Unigrams through trigrams
+tfidf = Classifier::TFIDF.new(ngram_range: [1, 3])
+```
+
+### Serialization
+
+```ruby
+# Save to JSON
+json = tfidf.to_json
+File.write("tfidf.json", json)
+
+# Load from JSON
+loaded = Classifier::TFIDF.from_json(File.read("tfidf.json"))
+
+# Or use Marshal
+data = Marshal.dump(tfidf)
+loaded = Marshal.load(data)
+```
+
 ## Persistence
 
 Save and load classifiers with pluggable storage backends. Works with Bayes, LSI, and kNN classifiers.
diff --git a/lib/classifier.rb b/lib/classifier.rb
index 81c9c90..1128590 100644
--- a/lib/classifier.rb
+++ b/lib/classifier.rb
@@ -32,3 +32,4 @@
 require 'classifier/bayes'
 require 'classifier/lsi'
 require 'classifier/knn'
+require 'classifier/tfidf'
diff --git a/lib/classifier/errors.rb b/lib/classifier/errors.rb
index e94d338..89fd6b0 100644
--- a/lib/classifier/errors.rb
+++ b/lib/classifier/errors.rb
@@ -13,4 +13,7 @@ class UnsavedChangesError < Error; end
 
   # Raised when a storage operation fails
   class StorageError < Error; end
+
+  # Raised when using an unfitted model
+  class NotFittedError < Error; end
 end
diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
new file mode 100644
index 0000000..64e04e6
--- /dev/null
+++ b/lib/classifier/tfidf.rb
@@ -0,0 +1,258 @@
+# rbs_inline: enabled
+
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2024 Lucas Carlson
+# License::   LGPL
+
+require 'json'
+
+module Classifier
+  # TF-IDF vectorizer: transforms text to weighted feature vectors.
+  # Downweights common words, upweights discriminative terms.
+  #
+  # Example:
+  #   tfidf = Classifier::TFIDF.new
+  #   tfidf.fit(["Dogs are great pets", "Cats are independent"])
+  #   tfidf.transform("Dogs are loyal")  # => {:dog=>1.0} (unseen "loyal" is dropped)
+  #
+  class TFIDF
+    # @rbs @min_df: Integer | Float
+    # @rbs @max_df: Integer | Float
+    # @rbs @ngram_range: Array[Integer]
+    # @rbs @sublinear_tf: bool
+    # @rbs @vocabulary: Hash[Symbol, Integer]
+    # @rbs @idf: Hash[Symbol, Float]
+    # @rbs @num_documents: Integer
+    # @rbs @fitted: bool
+
+    attr_reader :vocabulary, :idf, :num_documents
+
+    # Creates a new TF-IDF vectorizer.
+    # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
+    # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
+    # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    #
+    # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
+    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+      validate_df!(min_df, 'min_df')
+      validate_df!(max_df, 'max_df')
+      validate_ngram_range!(ngram_range)
+
+      @min_df = min_df
+      @max_df = max_df
+      @ngram_range = ngram_range
+      @sublinear_tf = sublinear_tf
+      @vocabulary = {}
+      @idf = {}
+      @num_documents = 0
+      @fitted = false
+    end
+
+    # Learns vocabulary and IDF weights from the corpus.
+    # @rbs (Array[String]) -> self
+    def fit(documents)
+      raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array)
+      raise ArgumentError, 'documents cannot be empty' if documents.empty?
+
+      @num_documents = documents.size
+      document_frequencies = Hash.new(0)
+
+      documents.each do |doc|
+        terms = extract_terms(doc)
+        terms.each_key { |term| document_frequencies[term] += 1 }
+      end
+
+      @vocabulary = {}
+      @idf = {}
+      vocab_index = 0
+
+      document_frequencies.each do |term, df|
+        next unless within_df_bounds?(df, @num_documents)
+
+        @vocabulary[term] = vocab_index
+        vocab_index += 1
+
+        # IDF: log((N + 1) / (df + 1)) + 1 with smoothing
+        @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
+      end
+
+      @fitted = true
+      self
+    end
+
+    # Transforms a document into a normalized TF-IDF vector.
+    # @rbs (String) -> Hash[Symbol, Float]
+    def transform(document)
+      raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
+
+      terms = extract_terms(document)
+      result = {}
+
+      terms.each do |term, tf|
+        next unless @vocabulary.key?(term)
+
+        tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
+        result[term] = tf_value * @idf[term]
+      end
+
+      normalize_vector(result)
+    end
+
+    # Fits and transforms in one step.
+    # @rbs (Array[String]) -> Array[Hash[Symbol, Float]]
+    def fit_transform(documents)
+      fit(documents)
+      documents.map { |doc| transform(doc) }
+    end
+
+    # Returns vocabulary terms in index order.
+    # @rbs () -> Array[Symbol]
+    def feature_names
+      @vocabulary.keys.sort_by { |term| @vocabulary[term] }
+    end
+
+    # @rbs () -> bool
+    def fitted?
+      @fitted
+    end
+
+    # @rbs (?untyped) -> Hash[Symbol, untyped]
+    def as_json(_options = nil)
+      {
+        version: 1,
+        type: 'tfidf',
+        min_df: @min_df,
+        max_df: @max_df,
+        ngram_range: @ngram_range,
+        sublinear_tf: @sublinear_tf,
+        vocabulary: @vocabulary,
+        idf: @idf,
+        num_documents: @num_documents,
+        fitted: @fitted
+      }
+    end
+
+    # @rbs (?untyped) -> String
+    def to_json(_options = nil)
+      as_json.to_json
+    end
+
+    # Loads a vectorizer from JSON.
+    # @rbs (String | Hash[String, untyped]) -> TFIDF
+    def self.from_json(json)
+      data = json.is_a?(String) ? JSON.parse(json) : json
+      raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf'
+
+      instance = new(
+        min_df: data['min_df'],
+        max_df: data['max_df'],
+        ngram_range: data['ngram_range'],
+        sublinear_tf: data['sublinear_tf']
+      )
+
+      instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
+      instance.instance_variable_set(:@idf, symbolize_keys(data['idf']))
+      instance.instance_variable_set(:@num_documents, data['num_documents'])
+      instance.instance_variable_set(:@fitted, data['fitted'])
+
+      instance
+    end
+
+    # @rbs () -> Array[untyped]
+    def marshal_dump
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+    end
+
+    # @rbs (Array[untyped]) -> void
+    def marshal_load(data)
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+    end
+
+    private
+
+    # @rbs (String) -> Hash[Symbol, Integer]
+    def extract_terms(document)
+      result = Hash.new(0)
+
+      if @ngram_range[0] <= 1
+        word_hash = document.clean_word_hash
+        word_hash.each { |term, count| result[term] += count }
+      end
+
+      if @ngram_range[1] > 1
+        tokens = tokenize_for_ngrams(document)
+        (2..@ngram_range[1]).each do |n|
+          next if n < @ngram_range[0]
+
+          generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
+        end
+      end
+
+      result
+    end
+
+    # @rbs (String) -> Array[String]
+    def tokenize_for_ngrams(document)
+      document
+        .gsub(/[^\w\s]/, '')
+        .split
+        .map(&:downcase)
+        .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) }
+        .map(&:stem)
+    end
+
+    # @rbs (Array[String], Integer) -> Array[Symbol]
+    def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
+      return [] if tokens.size < n
+
+      tokens.each_cons(n).map { |gram| gram.join('_').intern }
+    end
+
+    # @rbs (Integer, Integer) -> bool
+    def within_df_bounds?(doc_freq, num_docs)
+      min_count = @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df
+      max_count = @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+
+      doc_freq.between?(min_count, max_count)
+    end
+
+    # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]
+    def normalize_vector(vector)
+      return vector if vector.empty?
+
+      magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+      return vector if magnitude.zero?
+
+      vector.transform_values { |v| v / magnitude }
+    end
+
+    # @rbs (Integer | Float, String) -> void
+    def validate_df!(value, name)
+      if value.is_a?(Float)
+        raise ArgumentError, "#{name} must be between 0.0 and 1.0" unless value.between?(0.0, 1.0)
+      elsif value.is_a?(Integer)
+        raise ArgumentError, "#{name} must be non-negative" if value.negative?
+      else
+        raise ArgumentError, "#{name} must be an Integer or Float"
+      end
+    end
+
+    # @rbs (Array[Integer]) -> void
+    def validate_ngram_range!(range)
+      valid_structure = range.is_a?(Array) && range.size == 2
+      raise ArgumentError, 'ngram_range must be an array of two integers' unless valid_structure
+
+      valid_values = range.all? { |v| v.is_a?(Integer) && v.positive? }
+      raise ArgumentError, 'ngram_range values must be positive integers' unless valid_values
+
+      raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
+    end
+
+    # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped]
+    def self.symbolize_keys(hash)
+      hash.transform_keys(&:to_sym)
+    end
+    private_class_method :symbolize_keys
+  end
+end
diff --git a/test/tfidf/tfidf_test.rb b/test/tfidf/tfidf_test.rb
new file mode 100644
index 0000000..d2a2bbb
--- /dev/null
+++ b/test/tfidf/tfidf_test.rb
@@ -0,0 +1,430 @@
+require_relative '../test_helper'
+
+class TFIDFTest < Minitest::Test
+  def setup
+    @doc1 = 'Dogs are great pets and very loyal'
+    @doc2 = 'Cats are independent and self-sufficient'
+    @doc3 = 'Birds can fly and sing beautiful songs'
+    @doc4 = 'Dogs and cats are popular pets'
+    @corpus = [@doc1, @doc2, @doc3, @doc4]
+  end
+
+  # Initialization tests
+
+  def test_default_initialization
+    tfidf = Classifier::TFIDF.new
+
+    refute_predicate tfidf, :fitted?
+    assert_empty tfidf.vocabulary
+    assert_empty tfidf.idf
+    assert_equal 0, tfidf.num_documents
+  end
+
+  def test_custom_min_df_integer
+    tfidf = Classifier::TFIDF.new(min_df: 2)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in only 1 document should be excluded
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+      assert_operator doc_count, :>=, 2, "Term #{term} should appear in at least 2 documents"
+    end
+  end
+
+  def test_custom_min_df_float
+    tfidf = Classifier::TFIDF.new(min_df: 0.5)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in less than 50% of documents should be excluded
+    min_count = (@corpus.size * 0.5).ceil
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+      assert_operator doc_count, :>=, min_count
+    end
+  end
+
+  def test_custom_max_df_integer
+    tfidf = Classifier::TFIDF.new(max_df: 2)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in more than 2 documents should be excluded
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+      assert_operator doc_count, :<=, 2
+    end
+  end
+
+  def test_custom_max_df_float
+    tfidf = Classifier::TFIDF.new(max_df: 0.5)
+
+    tfidf.fit(@corpus)
+
+    # Terms appearing in more than 50% of documents should be excluded
+    max_count = (@corpus.size * 0.5).floor
+    tfidf.vocabulary.each_key do |term|
+      doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+      assert_operator doc_count, :<=, max_count
+    end
+  end
+
+  def test_invalid_min_df_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: -1) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 1.5) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 'invalid') }
+  end
+
+  def test_invalid_max_df_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: -1) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 1.5) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 'invalid') }
+  end
+
+  def test_invalid_ngram_range_raises
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [2, 1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [0, 1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [1]) }
+    assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: 'invalid') }
+  end
+
+  # Fit tests
+
+  def test_fit_builds_vocabulary
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(@corpus)
+
+    assert_predicate tfidf, :fitted?
+    refute_empty tfidf.vocabulary
+    assert_equal @corpus.size, tfidf.num_documents
+  end
+
+  def test_fit_computes_idf
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(@corpus)
+
+    refute_empty tfidf.idf
+    assert_equal tfidf.vocabulary.size, tfidf.idf.size
+
+    # All IDF values should be positive
+    tfidf.idf.each_value do |idf_value|
+      assert_operator idf_value, :>, 0
+    end
+  end
+
+  def test_fit_idf_ordering
+    # Terms appearing in fewer documents should have higher IDF
+    docs = [
+      'apple banana cherry',
+      'apple banana date',
+      'apple elderberry fig'
+    ]
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(docs)
+
+    # 'appl' appears in all 3 docs, 'banana' in 2, others in 1
+    # IDF should be: rare terms > common terms
+    assert_operator tfidf.idf[:elderberri], :>, tfidf.idf[:banana]
+    assert_operator tfidf.idf[:banana], :>, tfidf.idf[:appl]
+  end
+
+  def test_fit_returns_self
+    tfidf = Classifier::TFIDF.new
+
+    result = tfidf.fit(@corpus)
+
+    assert_same tfidf, result
+  end
+
+  def test_fit_with_empty_array_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(ArgumentError) { tfidf.fit([]) }
+  end
+
+  def test_fit_with_non_array_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(ArgumentError) { tfidf.fit('not an array') }
+  end
+
+  # Transform tests
+
+  def test_transform_returns_tfidf_vector
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    vector = tfidf.transform('Dogs are loyal pets')
+
+    assert_instance_of Hash, vector
+    refute_empty vector
+    vector.each_value { |v| assert_kind_of Float, v }
+  end
+
+  def test_transform_before_fit_raises
+    tfidf = Classifier::TFIDF.new
+
+    assert_raises(Classifier::NotFittedError) { tfidf.transform('Some text') }
+  end
+
+  def test_transform_normalizes_vector
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    vector = tfidf.transform('Dogs are loyal pets')
+
+    # L2 norm should be 1 (or close to it due to floating point)
+    magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+    assert_in_delta 1.0, magnitude, 0.0001
+  end
+
+  def test_transform_unknown_terms_ignored
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['apple banana', 'cherry date'])
+
+    # 'xyz' is not in vocabulary
+    vector = tfidf.transform('apple xyz')
+
+    refute vector.key?(:xyz)
+    assert vector.key?(:appl)
+  end
+
+  def test_transform_empty_result_for_unknown_text
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['apple banana', 'cherry date'])
+
+    vector = tfidf.transform('xyz uvw')
+
+    assert_empty vector
+  end
+
+  # fit_transform tests
+
+  def test_fit_transform
+    tfidf = Classifier::TFIDF.new
+
+    vectors = tfidf.fit_transform(@corpus)
+
+    assert_predicate tfidf, :fitted?
+    assert_equal @corpus.size, vectors.size
+    vectors.each { |v| assert_instance_of Hash, v }
+  end
+
+  # Sublinear TF tests
+
+  def test_sublinear_tf
+    # Create document with repeated term
+    doc_with_repeats = 'dog dog dog dog cat'
+    corpus = [doc_with_repeats, 'bird fish']
+
+    tfidf_linear = Classifier::TFIDF.new(sublinear_tf: false)
+    tfidf_sublinear = Classifier::TFIDF.new(sublinear_tf: true)
+
+    tfidf_linear.fit(corpus)
+    tfidf_sublinear.fit(corpus)
+
+    vec_linear = tfidf_linear.transform(doc_with_repeats)
+    vec_sublinear = tfidf_sublinear.transform(doc_with_repeats)
+
+    # With sublinear TF, the ratio of dog to cat should be smaller
+    # because 1 + log(4) < 4 (relative to 1 + log(1) = 1)
+    ratio_linear = vec_linear[:dog] / vec_linear[:cat]
+    ratio_sublinear = vec_sublinear[:dog] / vec_sublinear[:cat]
+
+    assert_operator ratio_sublinear, :<, ratio_linear
+  end
+
+  # N-gram tests
+
+  def test_bigrams
+    tfidf = Classifier::TFIDF.new(ngram_range: [1, 2])
+
+    tfidf.fit(['quick brown fox', 'lazy brown dog'])
+
+    # Should have bigrams in vocabulary
+    bigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.include?('_') }
+    refute_empty bigram_terms, 'Should have bigram terms'
+  end
+
+  def test_bigrams_only
+    tfidf = Classifier::TFIDF.new(ngram_range: [2, 2])
+
+    tfidf.fit(['quick brown fox', 'lazy brown dog'])
+
+    # Should only have bigrams (terms with underscore)
+    tfidf.vocabulary.each_key do |term|
+      assert term.to_s.include?('_'), "Term #{term} should be a bigram"
+    end
+  end
+
+  def test_trigrams
+    tfidf = Classifier::TFIDF.new(ngram_range: [1, 3])
+
+    tfidf.fit(['quick brown fox jumps', 'lazy brown dog runs'])
+
+    trigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.count('_') == 2 }
+    refute_empty trigram_terms, 'Should have trigram terms'
+  end
+
+  # feature_names tests
+
+  def test_feature_names
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    names = tfidf.feature_names
+
+    assert_instance_of Array, names
+    assert_equal tfidf.vocabulary.size, names.size
+    names.each { |n| assert_instance_of Symbol, n }
+  end
+
+  # Serialization tests
+
+  def test_as_json
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    data = tfidf.as_json
+
+    assert_equal 1, data[:version]
+    assert_equal 'tfidf', data[:type]
+    assert_equal 2, data[:min_df]
+    assert data[:sublinear_tf]
+    assert data[:fitted]
+    refute_empty data[:vocabulary]
+    refute_empty data[:idf]
+  end
+
+  def test_to_json
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    json = tfidf.to_json
+    data = JSON.parse(json)
+
+    assert_equal 'tfidf', data['type']
+    assert data['fitted']
+  end
+
+  def test_from_json_string
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    json = tfidf.to_json
+    loaded = Classifier::TFIDF.from_json(json)
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary.size, loaded.vocabulary.size
+    assert_equal tfidf.num_documents, loaded.num_documents
+
+    # Transform should produce same results
+    original_vec = tfidf.transform('Dogs are great')
+    loaded_vec = loaded.transform('Dogs are great')
+    assert_equal original_vec, loaded_vec
+  end
+
+  def test_from_json_hash
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(@corpus)
+
+    hash = JSON.parse(tfidf.to_json)
+    loaded = Classifier::TFIDF.from_json(hash)
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary.size, loaded.vocabulary.size
+  end
+
+  def test_from_json_invalid_type_raises
+    invalid_json = { version: 1, type: 'invalid' }.to_json
+
+    assert_raises(ArgumentError) { Classifier::TFIDF.from_json(invalid_json) }
+  end
+
+  # Marshal tests
+
+  def test_marshal_dump_load
+    tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true)
+    tfidf.fit(@corpus)
+
+    dumped = Marshal.dump(tfidf)
+    loaded = Marshal.load(dumped) # rubocop:disable Security/MarshalLoad
+
+    assert_predicate loaded, :fitted?
+    assert_equal tfidf.vocabulary, loaded.vocabulary
+    assert_equal tfidf.idf, loaded.idf
+
+    # Transform should produce same results
+    original_vec = tfidf.transform('Dogs are great')
+    loaded_vec = loaded.transform('Dogs are great')
+    assert_equal original_vec, loaded_vec
+  end
+
+  # Edge cases
+
+  def test_single_document_corpus
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['Single document with words'])
+
+    assert_predicate tfidf, :fitted?
+    refute_empty tfidf.vocabulary
+  end
+
+  def test_document_with_only_stopwords
+    tfidf = Classifier::TFIDF.new
+    tfidf.fit(['the and or but', 'dog cat bird'])
+
+    # Transform a document with only stopwords
+    vector = tfidf.transform('the and or but')
+
+    assert_empty vector
+  end
+
+  def test_repeated_fit_overwrites
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['apple banana'])
+    first_vocab = tfidf.vocabulary.dup
+
+    tfidf.fit(['cherry date elderberry'])
+
+    refute_equal first_vocab, tfidf.vocabulary
+  end
+
+  def test_unicode_text
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['Caf manger boire', 'chteau jardin maison'])
+    vector = tfidf.transform('Caf jardin')
+
+    refute_empty vector
+  end
+
+  def test_very_long_document
+    long_doc = (['word'] * 1000).join(' ')
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit([long_doc, 'short document'])
+    vector = tfidf.transform(long_doc)
+
+    refute_empty vector
+    # Should still be normalized
+    magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+    assert_in_delta 1.0, magnitude, 0.0001 unless vector.empty?
+  end
+
+  def test_empty_document_in_corpus
+    # Empty strings should not cause issues
+    tfidf = Classifier::TFIDF.new
+
+    tfidf.fit(['dog cat', '', 'bird fish'])
+
+    assert_predicate tfidf, :fitted?
+    assert_equal 3, tfidf.num_documents
+  end
+end

From 293dece148358ffd25eca853ea603c220e68c625 Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:22:12 -0800
Subject: [PATCH 2/7] refactor(tfidf): flatten validation methods with early
 returns

Replace nested if/elsif/else conditionals in validate_df! and
validate_ngram_range! with guard clauses for better readability.
Inline single-use intermediate variables. Remove redundant "with
smoothing" from IDF formula comment.

Addresses style feedback from PR #107 review.
---
 lib/classifier/tfidf.rb | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index 64e04e6..e90c1df 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -73,7 +73,7 @@ def fit(documents)
         @vocabulary[term] = vocab_index
         vocab_index += 1
 
-        # IDF: log((N + 1) / (df + 1)) + 1 with smoothing
+        # IDF: log((N + 1) / (df + 1)) + 1
         @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
       end
 
@@ -229,23 +229,17 @@ def normalize_vector(vector)
 
     # @rbs (Integer | Float, String) -> void
     def validate_df!(value, name)
-      if value.is_a?(Float)
-        raise ArgumentError, "#{name} must be between 0.0 and 1.0" unless value.between?(0.0, 1.0)
-      elsif value.is_a?(Integer)
-        raise ArgumentError, "#{name} must be non-negative" if value.negative?
-      else
-        raise ArgumentError, "#{name} must be an Integer or Float"
-      end
+      raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer)
+      raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0)
+      raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative?
     end
 
     # @rbs (Array[Integer]) -> void
     def validate_ngram_range!(range)
-      valid_structure = range.is_a?(Array) && range.size == 2
-      raise ArgumentError, 'ngram_range must be an array of two integers' unless valid_structure
-
-      valid_values = range.all? { |v| v.is_a?(Integer) && v.positive? }
-      raise ArgumentError, 'ngram_range values must be positive integers' unless valid_values
-
+      raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
+      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all? do |v|
+        v.is_a?(Integer) && v.positive?
+      end
       raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
     end
 

From 748a20943538ac81acc9842887298e1e1de15bca Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:27:22 -0800
Subject: [PATCH 3/7] fix(tfidf): resolve lint and typecheck errors

- Add type annotation to empty hash in transform method
- Use JSON.generate instead of Hash#to_json for type safety
- Cast multiplication result to Float for type checker
- Auto-fix Minitest assertion style in tests
---
 lib/classifier/tfidf.rb  |  6 +++---
 test/tfidf/tfidf_test.rb | 13 +++++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index e90c1df..7e12752 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -87,13 +87,13 @@ def transform(document)
       raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
 
       terms = extract_terms(document)
-      result = {}
+      result = {} #: Hash[Symbol, Float]
 
       terms.each do |term, tf|
         next unless @vocabulary.key?(term)
 
         tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
-        result[term] = tf_value * @idf[term]
+        result[term] = (tf_value * @idf[term]).to_f
       end
 
       normalize_vector(result)
@@ -135,7 +135,7 @@ def as_json(_options = nil)
 
     # @rbs (?untyped) -> String
     def to_json(_options = nil)
-      as_json.to_json
+      JSON.generate(as_json)
     end
 
     # Loads a vectorizer from JSON.
diff --git a/test/tfidf/tfidf_test.rb b/test/tfidf/tfidf_test.rb
index d2a2bbb..fde00e2 100644
--- a/test/tfidf/tfidf_test.rb
+++ b/test/tfidf/tfidf_test.rb
@@ -28,6 +28,7 @@ def test_custom_min_df_integer
     # Terms appearing in only 1 document should be excluded
     tfidf.vocabulary.each_key do |term|
       doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
       assert_operator doc_count, :>=, 2, "Term #{term} should appear in at least 2 documents"
     end
   end
@@ -41,6 +42,7 @@ def test_custom_min_df_float
     min_count = (@corpus.size * 0.5).ceil
     tfidf.vocabulary.each_key do |term|
       doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
       assert_operator doc_count, :>=, min_count
     end
   end
@@ -53,6 +55,7 @@ def test_custom_max_df_integer
     # Terms appearing in more than 2 documents should be excluded
     tfidf.vocabulary.each_key do |term|
       doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
       assert_operator doc_count, :<=, 2
     end
   end
@@ -66,6 +69,7 @@ def test_custom_max_df_float
     max_count = (@corpus.size * 0.5).floor
     tfidf.vocabulary.each_key do |term|
       doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) }
+
       assert_operator doc_count, :<=, max_count
     end
   end
@@ -179,6 +183,7 @@ def test_transform_normalizes_vector
 
     # L2 norm should be 1 (or close to it due to floating point)
     magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+
     assert_in_delta 1.0, magnitude, 0.0001
   end
 
@@ -247,6 +252,7 @@ def test_bigrams
 
     # Should have bigrams in vocabulary
     bigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.include?('_') }
+
     refute_empty bigram_terms, 'Should have bigram terms'
   end
 
@@ -257,7 +263,7 @@ def test_bigrams_only
 
     # Should only have bigrams (terms with underscore)
     tfidf.vocabulary.each_key do |term|
-      assert term.to_s.include?('_'), "Term #{term} should be a bigram"
+      assert_includes term.to_s, '_', "Term #{term} should be a bigram"
     end
   end
 
@@ -267,6 +273,7 @@ def test_trigrams
     tfidf.fit(['quick brown fox jumps', 'lazy brown dog runs'])
 
     trigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.count('_') == 2 }
+
     refute_empty trigram_terms, 'Should have trigram terms'
   end
 
@@ -325,6 +332,7 @@ def test_from_json_string
     # Transform should produce same results
     original_vec = tfidf.transform('Dogs are great')
     loaded_vec = loaded.transform('Dogs are great')
+
     assert_equal original_vec, loaded_vec
   end
 
@@ -352,7 +360,7 @@ def test_marshal_dump_load
     tfidf.fit(@corpus)
 
     dumped = Marshal.dump(tfidf)
-    loaded = Marshal.load(dumped) # rubocop:disable Security/MarshalLoad
+    loaded = Marshal.load(dumped)
 
     assert_predicate loaded, :fitted?
     assert_equal tfidf.vocabulary, loaded.vocabulary
@@ -361,6 +369,7 @@ def test_marshal_dump_load
     # Transform should produce same results
     original_vec = tfidf.transform('Dogs are great')
     loaded_vec = loaded.transform('Dogs are great')
+
     assert_equal original_vec, loaded_vec
   end
 

From 71d086ca2f755ff047fadc0c9b106985ee62a9cb Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:28:13 -0800
Subject: [PATCH 4/7] refactor(tfidf): use early return in extract_terms

---
 lib/classifier/tfidf.rb | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index 7e12752..f3329c5 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -180,13 +180,13 @@ def extract_terms(document)
         word_hash.each { |term, count| result[term] += count }
       end
 
-      if @ngram_range[1] > 1
-        tokens = tokenize_for_ngrams(document)
-        (2..@ngram_range[1]).each do |n|
-          next if n < @ngram_range[0]
+      return result if @ngram_range[1] <= 1
 
-          generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
-        end
+      tokens = tokenize_for_ngrams(document)
+      (2..@ngram_range[1]).each do |n|
+        next if n < @ngram_range[0]
+
+        generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
       end
 
       result

From 7c05edf67f13dbef3c96ff3883a1d7207c10a35e Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:29:13 -0800
Subject: [PATCH 5/7] refactor(tfidf): simplify ngram validation with
 all?(Integer)

---
 lib/classifier/tfidf.rb | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index f3329c5..c032c28 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -237,9 +237,7 @@ def validate_df!(value, name)
     # @rbs (Array[Integer]) -> void
     def validate_ngram_range!(range)
       raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
-      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all? do |v|
-        v.is_a?(Integer) && v.positive?
-      end
+      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?)
       raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
     end
 

From 38a65f8f3a189905394a30b9416bc525b3700b87 Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:30:34 -0800
Subject: [PATCH 6/7] refactor(tfidf): inline single-use variables in
 within_df_bounds?

---
 lib/classifier/tfidf.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index c032c28..425cbc7 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -211,10 +211,10 @@ def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
 
     # @rbs (Integer, Integer) -> bool
     def within_df_bounds?(doc_freq, num_docs)
-      min_count = @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df
-      max_count = @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
-
-      doc_freq.between?(min_count, max_count)
+      doc_freq.between?(
+        @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df,
+        @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+      )
     end
 
     # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]

From 47c8f5067cca3eb8a979ffd0fc6ba56941f32c0b Mon Sep 17 00:00:00 2001
From: Lucas Carlson <lucas@carlson.net>
Date: Sun, 28 Dec 2025 19:37:47 -0800
Subject: [PATCH 7/7] fix(tfidf): fix line length in validate_ngram_range!

---
 lib/classifier/tfidf.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb
index 425cbc7..f698476 100644
--- a/lib/classifier/tfidf.rb
+++ b/lib/classifier/tfidf.rb
@@ -237,7 +237,9 @@ def validate_df!(value, name)
     # @rbs (Array[Integer]) -> void
     def validate_ngram_range!(range)
       raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
-      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?)
+      unless range.all?(Integer) && range.all?(&:positive?)
+        raise ArgumentError, 'ngram_range values must be positive integers'
+      end
       raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
     end