From 05bf9a74fe0c9f2436c60ab978ba14ad7f9bf71e Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:04:14 -0800 Subject: [PATCH 1/7] feat(tfidf): add TF-IDF vectorizer Adds standalone TF-IDF vectorizer for text feature extraction. Foundation for classic text classification approaches like logistic regression and improved LSI quality. Features: - fit/transform/fit_transform API (scikit-learn style) - Vocabulary filtering via min_df/max_df thresholds - N-gram support (unigrams, bigrams, trigrams) - Sublinear TF scaling (1 + log(tf)) - L2 normalized output vectors - JSON and Marshal serialization Leverages existing word_hash infrastructure for term frequency extraction with stemming and stopword removal. Closes #104 --- README.md | 74 ++++++- lib/classifier.rb | 1 + lib/classifier/errors.rb | 3 + lib/classifier/tfidf.rb | 258 +++++++++++++++++++++++ test/tfidf/tfidf_test.rb | 430 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 765 insertions(+), 1 deletion(-) create mode 100644 lib/classifier/tfidf.rb create mode 100644 test/tfidf/tfidf_test.rb diff --git a/README.md b/README.md index b4e0eaa..df722e5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![CI](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml/badge.svg)](https://github.com/cardmagic/classifier/actions/workflows/ruby.yml) [![License: LGPL](https://img.shields.io/badge/License-LGPL_2.1-blue.svg)](https://opensource.org/licenses/LGPL-2.1) -A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), and k-Nearest Neighbors (kNN) algorithms. +A Ruby library for text classification using Bayesian, LSI (Latent Semantic Indexing), k-Nearest Neighbors (kNN), and TF-IDF algorithms. 
**[Documentation](https://rubyclassifier.com/docs)** · **[Tutorials](https://rubyclassifier.com/docs/tutorials)** · **[Guides](https://rubyclassifier.com/docs/guides)** @@ -14,6 +14,7 @@ A Ruby library for text classification using Bayesian, LSI (Latent Semantic Inde - [Bayesian Classifier](#bayesian-classifier) - [LSI (Latent Semantic Indexing)](#lsi-latent-semantic-indexing) - [k-Nearest Neighbors (kNN)](#k-nearest-neighbors-knn) +- [TF-IDF Vectorizer](#tf-idf-vectorizer) - [Persistence](#persistence) - [Performance](#performance) - [Development](#development) @@ -256,6 +257,77 @@ knn.categories **Why the size difference?** Bayes stores aggregate statistics—adding 10,000 documents just increments counters. kNN stores every example and compares against all of them during classification, so performance degrades with size. +## TF-IDF Vectorizer + +Transform text documents into TF-IDF (Term Frequency-Inverse Document Frequency) weighted feature vectors. TF-IDF downweights common words and upweights discriminative terms—the foundation for most classic text classification approaches. 
+ +### Quick Start + +```ruby +require 'classifier' + +tfidf = Classifier::TFIDF.new +tfidf.fit(["Dogs are great pets", "Cats are independent", "Birds can fly"]) + +# Transform text to TF-IDF vector (L2 normalized) +vector = tfidf.transform("Dogs are loyal") +# => {:dog=>1.0} ("loyal" was not seen during fit, so it is ignored) + +# Fit and transform in one step +vectors = tfidf.fit_transform(documents) +``` + +### Options + +```ruby +tfidf = Classifier::TFIDF.new( + min_df: 2, # Minimum document frequency (Integer or Float 0.0-1.0) + max_df: 0.95, # Maximum document frequency (filters very common terms) + ngram_range: [1, 2], # Extract unigrams and bigrams + sublinear_tf: true # Use 1 + log(tf) instead of raw term frequency +) +``` + +### Vocabulary Inspection + +```ruby +tfidf.fit(documents) + +tfidf.vocabulary # => {:dog=>0, :cat=>1, :bird=>2, ...} +tfidf.idf # => {:dog=>1.693, :cat=>1.693, ...} +tfidf.feature_names # => [:dog, :cat, :bird, ...] +tfidf.num_documents # => 3 +tfidf.fitted? # => true +``` + +### N-gram Support + +```ruby +# Extract bigrams only +tfidf = Classifier::TFIDF.new(ngram_range: [2, 2]) +tfidf.fit(["quick brown fox", "lazy brown dog"]) +tfidf.vocabulary.keys +# => [:quick_brown, :brown_fox, :lazi_brown, :brown_dog] + +# Unigrams through trigrams +tfidf = Classifier::TFIDF.new(ngram_range: [1, 3]) +``` + +### Serialization + +```ruby +# Save to JSON +json = tfidf.to_json +File.write("tfidf.json", json) + +# Load from JSON +loaded = Classifier::TFIDF.from_json(File.read("tfidf.json")) + +# Or use Marshal +data = Marshal.dump(tfidf) +loaded = Marshal.load(data) +``` + ## Persistence Save and load classifiers with pluggable storage backends. Works with Bayes, LSI, and kNN classifiers. 
diff --git a/lib/classifier.rb b/lib/classifier.rb index 81c9c90..1128590 100644 --- a/lib/classifier.rb +++ b/lib/classifier.rb @@ -32,3 +32,4 @@ require 'classifier/bayes' require 'classifier/lsi' require 'classifier/knn' +require 'classifier/tfidf' diff --git a/lib/classifier/errors.rb b/lib/classifier/errors.rb index e94d338..89fd6b0 100644 --- a/lib/classifier/errors.rb +++ b/lib/classifier/errors.rb @@ -13,4 +13,7 @@ class UnsavedChangesError < Error; end # Raised when a storage operation fails class StorageError < Error; end + + # Raised when using an unfitted model + class NotFittedError < Error; end end diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb new file mode 100644 index 0000000..64e04e6 --- /dev/null +++ b/lib/classifier/tfidf.rb @@ -0,0 +1,258 @@ +# rbs_inline: enabled + +# Author:: Lucas Carlson (mailto:lucas@rufy.com) +# Copyright:: Copyright (c) 2024 Lucas Carlson +# License:: LGPL + +require 'json' + +module Classifier + # TF-IDF vectorizer: transforms text to weighted feature vectors. + # Downweights common words, upweights discriminative terms. + # + # Example: + # tfidf = Classifier::TFIDF.new + # tfidf.fit(["Dogs are great pets", "Cats are independent"]) + # tfidf.transform("Dogs are loyal") # => {:dog=>1.0} ("loyal" is unseen, so ignored) + # + class TFIDF + # @rbs @min_df: Integer | Float + # @rbs @max_df: Integer | Float + # @rbs @ngram_range: Array[Integer] + # @rbs @sublinear_tf: bool + # @rbs @vocabulary: Hash[Symbol, Integer] + # @rbs @idf: Hash[Symbol, Float] + # @rbs @num_documents: Integer + # @rbs @fitted: bool + + attr_reader :vocabulary, :idf, :num_documents + + # Creates a new TF-IDF vectorizer. 
+ # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion) + # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams + # - sublinear_tf: use 1 + log(tf) instead of raw term frequency + # + # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float, + # ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void + def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false) + validate_df!(min_df, 'min_df') + validate_df!(max_df, 'max_df') + validate_ngram_range!(ngram_range) + + @min_df = min_df + @max_df = max_df + @ngram_range = ngram_range + @sublinear_tf = sublinear_tf + @vocabulary = {} + @idf = {} + @num_documents = 0 + @fitted = false + end + + # Learns vocabulary and IDF weights from the corpus. + # @rbs (Array[String]) -> self + def fit(documents) + raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array) + raise ArgumentError, 'documents cannot be empty' if documents.empty? + + @num_documents = documents.size + document_frequencies = Hash.new(0) + + documents.each do |doc| + terms = extract_terms(doc) + terms.each_key { |term| document_frequencies[term] += 1 } + end + + @vocabulary = {} + @idf = {} + vocab_index = 0 + + document_frequencies.each do |term, df| + next unless within_df_bounds?(df, @num_documents) + + @vocabulary[term] = vocab_index + vocab_index += 1 + + # IDF: log((N + 1) / (df + 1)) + 1 with smoothing + @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1 + end + + @fitted = true + self + end + + # Transforms a document into a normalized TF-IDF vector. + # @rbs (String) -> Hash[Symbol, Float] + def transform(document) + raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted + + terms = extract_terms(document) + result = {} + + terms.each do |term, tf| + next unless @vocabulary.key?(term) + + tf_value = @sublinear_tf && tf.positive? ? 
1 + Math.log(tf) : tf.to_f + result[term] = tf_value * @idf[term] + end + + normalize_vector(result) + end + + # Fits and transforms in one step. + # @rbs (Array[String]) -> Array[Hash[Symbol, Float]] + def fit_transform(documents) + fit(documents) + documents.map { |doc| transform(doc) } + end + + # Returns vocabulary terms in index order. + # @rbs () -> Array[Symbol] + def feature_names + @vocabulary.keys.sort_by { |term| @vocabulary[term] } + end + + # @rbs () -> bool + def fitted? + @fitted + end + + # @rbs (?untyped) -> Hash[Symbol, untyped] + def as_json(_options = nil) + { + version: 1, + type: 'tfidf', + min_df: @min_df, + max_df: @max_df, + ngram_range: @ngram_range, + sublinear_tf: @sublinear_tf, + vocabulary: @vocabulary, + idf: @idf, + num_documents: @num_documents, + fitted: @fitted + } + end + + # @rbs (?untyped) -> String + def to_json(_options = nil) + as_json.to_json + end + + # Loads a vectorizer from JSON. + # @rbs (String | Hash[String, untyped]) -> TFIDF + def self.from_json(json) + data = json.is_a?(String) ? 
JSON.parse(json) : json + raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf' + + instance = new( + min_df: data['min_df'], + max_df: data['max_df'], + ngram_range: data['ngram_range'], + sublinear_tf: data['sublinear_tf'] + ) + + instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary'])) + instance.instance_variable_set(:@idf, symbolize_keys(data['idf'])) + instance.instance_variable_set(:@num_documents, data['num_documents']) + instance.instance_variable_set(:@fitted, data['fitted']) + + instance + end + + # @rbs () -> Array[untyped] + def marshal_dump + [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted] + end + + # @rbs (Array[untyped]) -> void + def marshal_load(data) + @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data + end + + private + + # @rbs (String) -> Hash[Symbol, Integer] + def extract_terms(document) + result = Hash.new(0) + + if @ngram_range[0] <= 1 + word_hash = document.clean_word_hash + word_hash.each { |term, count| result[term] += count } + end + + if @ngram_range[1] > 1 + tokens = tokenize_for_ngrams(document) + (2..@ngram_range[1]).each do |n| + next if n < @ngram_range[0] + + generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 } + end + end + + result + end + + # @rbs (String) -> Array[String] + def tokenize_for_ngrams(document) + document + .gsub(/[^\w\s]/, '') + .split + .map(&:downcase) + .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) } + .map(&:stem) + end + + # @rbs (Array[String], Integer) -> Array[Symbol] + def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName + return [] if tokens.size < n + + tokens.each_cons(n).map { |gram| gram.join('_').intern } + end + + # @rbs (Integer, Integer) -> bool + def within_df_bounds?(doc_freq, num_docs) + min_count = @min_df.is_a?(Float) ? 
(@min_df * num_docs).ceil : @min_df + max_count = @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df + + doc_freq.between?(min_count, max_count) + end + + # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float] + def normalize_vector(vector) + return vector if vector.empty? + + magnitude = Math.sqrt(vector.values.sum { |v| v * v }) + return vector if magnitude.zero? + + vector.transform_values { |v| v / magnitude } + end + + # @rbs (Integer | Float, String) -> void + def validate_df!(value, name) + if value.is_a?(Float) + raise ArgumentError, "#{name} must be between 0.0 and 1.0" unless value.between?(0.0, 1.0) + elsif value.is_a?(Integer) + raise ArgumentError, "#{name} must be non-negative" if value.negative? + else + raise ArgumentError, "#{name} must be an Integer or Float" + end + end + + # @rbs (Array[Integer]) -> void + def validate_ngram_range!(range) + valid_structure = range.is_a?(Array) && range.size == 2 + raise ArgumentError, 'ngram_range must be an array of two integers' unless valid_structure + + valid_values = range.all? { |v| v.is_a?(Integer) && v.positive? 
} + raise ArgumentError, 'ngram_range values must be positive integers' unless valid_values + + raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1] + end + + # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped] + def self.symbolize_keys(hash) + hash.transform_keys(&:to_sym) + end + private_class_method :symbolize_keys + end +end diff --git a/test/tfidf/tfidf_test.rb b/test/tfidf/tfidf_test.rb new file mode 100644 index 0000000..d2a2bbb --- /dev/null +++ b/test/tfidf/tfidf_test.rb @@ -0,0 +1,430 @@ +require_relative '../test_helper' + +class TFIDFTest < Minitest::Test + def setup + @doc1 = 'Dogs are great pets and very loyal' + @doc2 = 'Cats are independent and self-sufficient' + @doc3 = 'Birds can fly and sing beautiful songs' + @doc4 = 'Dogs and cats are popular pets' + @corpus = [@doc1, @doc2, @doc3, @doc4] + end + + # Initialization tests + + def test_default_initialization + tfidf = Classifier::TFIDF.new + + refute_predicate tfidf, :fitted? + assert_empty tfidf.vocabulary + assert_empty tfidf.idf + assert_equal 0, tfidf.num_documents + end + + def test_custom_min_df_integer + tfidf = Classifier::TFIDF.new(min_df: 2) + + tfidf.fit(@corpus) + + # Terms appearing in only 1 document should be excluded + tfidf.vocabulary.each_key do |term| + doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :>=, 2, "Term #{term} should appear in at least 2 documents" + end + end + + def test_custom_min_df_float + tfidf = Classifier::TFIDF.new(min_df: 0.5) + + tfidf.fit(@corpus) + + # Terms appearing in less than 50% of documents should be excluded + min_count = (@corpus.size * 0.5).ceil + tfidf.vocabulary.each_key do |term| + doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :>=, min_count + end + end + + def test_custom_max_df_integer + tfidf = Classifier::TFIDF.new(max_df: 2) + + tfidf.fit(@corpus) + + # Terms appearing in more than 2 documents should 
be excluded + tfidf.vocabulary.each_key do |term| + doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :<=, 2 + end + end + + def test_custom_max_df_float + tfidf = Classifier::TFIDF.new(max_df: 0.5) + + tfidf.fit(@corpus) + + # Terms appearing in more than 50% of documents should be excluded + max_count = (@corpus.size * 0.5).floor + tfidf.vocabulary.each_key do |term| + doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :<=, max_count + end + end + + def test_invalid_min_df_raises + assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: -1) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 1.5) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(min_df: 'invalid') } + end + + def test_invalid_max_df_raises + assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: -1) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 1.5) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(max_df: 'invalid') } + end + + def test_invalid_ngram_range_raises + assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [2, 1]) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [0, 1]) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: [1]) } + assert_raises(ArgumentError) { Classifier::TFIDF.new(ngram_range: 'invalid') } + end + + # Fit tests + + def test_fit_builds_vocabulary + tfidf = Classifier::TFIDF.new + + tfidf.fit(@corpus) + + assert_predicate tfidf, :fitted? 
+ refute_empty tfidf.vocabulary + assert_equal @corpus.size, tfidf.num_documents + end + + def test_fit_computes_idf + tfidf = Classifier::TFIDF.new + + tfidf.fit(@corpus) + + refute_empty tfidf.idf + assert_equal tfidf.vocabulary.size, tfidf.idf.size + + # All IDF values should be positive + tfidf.idf.each_value do |idf_value| + assert_operator idf_value, :>, 0 + end + end + + def test_fit_idf_ordering + # Terms appearing in fewer documents should have higher IDF + docs = [ + 'apple banana cherry', + 'apple banana date', + 'apple elderberry fig' + ] + tfidf = Classifier::TFIDF.new + + tfidf.fit(docs) + + # 'appl' appears in all 3 docs, 'banana' in 2, others in 1 + # IDF should be: rare terms > common terms + assert_operator tfidf.idf[:elderberri], :>, tfidf.idf[:banana] + assert_operator tfidf.idf[:banana], :>, tfidf.idf[:appl] + end + + def test_fit_returns_self + tfidf = Classifier::TFIDF.new + + result = tfidf.fit(@corpus) + + assert_same tfidf, result + end + + def test_fit_with_empty_array_raises + tfidf = Classifier::TFIDF.new + + assert_raises(ArgumentError) { tfidf.fit([]) } + end + + def test_fit_with_non_array_raises + tfidf = Classifier::TFIDF.new + + assert_raises(ArgumentError) { tfidf.fit('not an array') } + end + + # Transform tests + + def test_transform_returns_tfidf_vector + tfidf = Classifier::TFIDF.new + tfidf.fit(@corpus) + + vector = tfidf.transform('Dogs are loyal pets') + + assert_instance_of Hash, vector + refute_empty vector + vector.each_value { |v| assert_kind_of Float, v } + end + + def test_transform_before_fit_raises + tfidf = Classifier::TFIDF.new + + assert_raises(Classifier::NotFittedError) { tfidf.transform('Some text') } + end + + def test_transform_normalizes_vector + tfidf = Classifier::TFIDF.new + tfidf.fit(@corpus) + + vector = tfidf.transform('Dogs are loyal pets') + + # L2 norm should be 1 (or close to it due to floating point) + magnitude = Math.sqrt(vector.values.sum { |v| v * v }) + assert_in_delta 1.0, magnitude, 
0.0001 + end + + def test_transform_unknown_terms_ignored + tfidf = Classifier::TFIDF.new + tfidf.fit(['apple banana', 'cherry date']) + + # 'xyz' is not in vocabulary + vector = tfidf.transform('apple xyz') + + refute vector.key?(:xyz) + assert vector.key?(:appl) + end + + def test_transform_empty_result_for_unknown_text + tfidf = Classifier::TFIDF.new + tfidf.fit(['apple banana', 'cherry date']) + + vector = tfidf.transform('xyz uvw') + + assert_empty vector + end + + # fit_transform tests + + def test_fit_transform + tfidf = Classifier::TFIDF.new + + vectors = tfidf.fit_transform(@corpus) + + assert_predicate tfidf, :fitted? + assert_equal @corpus.size, vectors.size + vectors.each { |v| assert_instance_of Hash, v } + end + + # Sublinear TF tests + + def test_sublinear_tf + # Create document with repeated term + doc_with_repeats = 'dog dog dog dog cat' + corpus = [doc_with_repeats, 'bird fish'] + + tfidf_linear = Classifier::TFIDF.new(sublinear_tf: false) + tfidf_sublinear = Classifier::TFIDF.new(sublinear_tf: true) + + tfidf_linear.fit(corpus) + tfidf_sublinear.fit(corpus) + + vec_linear = tfidf_linear.transform(doc_with_repeats) + vec_sublinear = tfidf_sublinear.transform(doc_with_repeats) + + # With sublinear TF, the ratio of dog to cat should be smaller + # because 1 + log(4) < 4 (relative to 1 + log(1) = 1) + ratio_linear = vec_linear[:dog] / vec_linear[:cat] + ratio_sublinear = vec_sublinear[:dog] / vec_sublinear[:cat] + + assert_operator ratio_sublinear, :<, ratio_linear + end + + # N-gram tests + + def test_bigrams + tfidf = Classifier::TFIDF.new(ngram_range: [1, 2]) + + tfidf.fit(['quick brown fox', 'lazy brown dog']) + + # Should have bigrams in vocabulary + bigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.include?('_') } + refute_empty bigram_terms, 'Should have bigram terms' + end + + def test_bigrams_only + tfidf = Classifier::TFIDF.new(ngram_range: [2, 2]) + + tfidf.fit(['quick brown fox', 'lazy brown dog']) + + # Should only have bigrams 
(terms with underscore) + tfidf.vocabulary.each_key do |term| + assert term.to_s.include?('_'), "Term #{term} should be a bigram" + end + end + + def test_trigrams + tfidf = Classifier::TFIDF.new(ngram_range: [1, 3]) + + tfidf.fit(['quick brown fox jumps', 'lazy brown dog runs']) + + trigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.count('_') == 2 } + refute_empty trigram_terms, 'Should have trigram terms' + end + + # feature_names tests + + def test_feature_names + tfidf = Classifier::TFIDF.new + tfidf.fit(@corpus) + + names = tfidf.feature_names + + assert_instance_of Array, names + assert_equal tfidf.vocabulary.size, names.size + names.each { |n| assert_instance_of Symbol, n } + end + + # Serialization tests + + def test_as_json + tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true) + tfidf.fit(@corpus) + + data = tfidf.as_json + + assert_equal 1, data[:version] + assert_equal 'tfidf', data[:type] + assert_equal 2, data[:min_df] + assert data[:sublinear_tf] + assert data[:fitted] + refute_empty data[:vocabulary] + refute_empty data[:idf] + end + + def test_to_json + tfidf = Classifier::TFIDF.new + tfidf.fit(@corpus) + + json = tfidf.to_json + data = JSON.parse(json) + + assert_equal 'tfidf', data['type'] + assert data['fitted'] + end + + def test_from_json_string + tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true) + tfidf.fit(@corpus) + + json = tfidf.to_json + loaded = Classifier::TFIDF.from_json(json) + + assert_predicate loaded, :fitted? 
+ assert_equal tfidf.vocabulary.size, loaded.vocabulary.size + assert_equal tfidf.num_documents, loaded.num_documents + + # Transform should produce same results + original_vec = tfidf.transform('Dogs are great') + loaded_vec = loaded.transform('Dogs are great') + assert_equal original_vec, loaded_vec + end + + def test_from_json_hash + tfidf = Classifier::TFIDF.new + tfidf.fit(@corpus) + + hash = JSON.parse(tfidf.to_json) + loaded = Classifier::TFIDF.from_json(hash) + + assert_predicate loaded, :fitted? + assert_equal tfidf.vocabulary.size, loaded.vocabulary.size + end + + def test_from_json_invalid_type_raises + invalid_json = { version: 1, type: 'invalid' }.to_json + + assert_raises(ArgumentError) { Classifier::TFIDF.from_json(invalid_json) } + end + + # Marshal tests + + def test_marshal_dump_load + tfidf = Classifier::TFIDF.new(min_df: 2, sublinear_tf: true) + tfidf.fit(@corpus) + + dumped = Marshal.dump(tfidf) + loaded = Marshal.load(dumped) # rubocop:disable Security/MarshalLoad + + assert_predicate loaded, :fitted? + assert_equal tfidf.vocabulary, loaded.vocabulary + assert_equal tfidf.idf, loaded.idf + + # Transform should produce same results + original_vec = tfidf.transform('Dogs are great') + loaded_vec = loaded.transform('Dogs are great') + assert_equal original_vec, loaded_vec + end + + # Edge cases + + def test_single_document_corpus + tfidf = Classifier::TFIDF.new + + tfidf.fit(['Single document with words']) + + assert_predicate tfidf, :fitted? 
+ refute_empty tfidf.vocabulary + end + + def test_document_with_only_stopwords + tfidf = Classifier::TFIDF.new + tfidf.fit(['the and or but', 'dog cat bird']) + + # Transform a document with only stopwords + vector = tfidf.transform('the and or but') + + assert_empty vector + end + + def test_repeated_fit_overwrites + tfidf = Classifier::TFIDF.new + + tfidf.fit(['apple banana']) + first_vocab = tfidf.vocabulary.dup + + tfidf.fit(['cherry date elderberry']) + + refute_equal first_vocab, tfidf.vocabulary + end + + def test_unicode_text + tfidf = Classifier::TFIDF.new + + tfidf.fit(['Caf manger boire', 'chteau jardin maison']) + vector = tfidf.transform('Caf jardin') + + refute_empty vector + end + + def test_very_long_document + long_doc = (['word'] * 1000).join(' ') + tfidf = Classifier::TFIDF.new + + tfidf.fit([long_doc, 'short document']) + vector = tfidf.transform(long_doc) + + refute_empty vector + # Should still be normalized + magnitude = Math.sqrt(vector.values.sum { |v| v * v }) + assert_in_delta 1.0, magnitude, 0.0001 unless vector.empty? + end + + def test_empty_document_in_corpus + # Empty strings should not cause issues + tfidf = Classifier::TFIDF.new + + tfidf.fit(['dog cat', '', 'bird fish']) + + assert_predicate tfidf, :fitted? + assert_equal 3, tfidf.num_documents + end +end From 293dece148358ffd25eca853ea603c220e68c625 Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:22:12 -0800 Subject: [PATCH 2/7] refactor(tfidf): flatten validation methods with early returns Replace nested if/elsif/else conditionals in validate_df! and validate_ngram_range! with guard clauses for better readability. Inline single-use intermediate variables. Remove redundant "with smoothing" from IDF formula comment. Addresses style feedback from PR #107 review. 
--- lib/classifier/tfidf.rb | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index 64e04e6..e90c1df 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -73,7 +73,7 @@ def fit(documents) @vocabulary[term] = vocab_index vocab_index += 1 - # IDF: log((N + 1) / (df + 1)) + 1 with smoothing + # IDF: log((N + 1) / (df + 1)) + 1 @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1 end @@ -229,23 +229,17 @@ def normalize_vector(vector) # @rbs (Integer | Float, String) -> void def validate_df!(value, name) - if value.is_a?(Float) - raise ArgumentError, "#{name} must be between 0.0 and 1.0" unless value.between?(0.0, 1.0) - elsif value.is_a?(Integer) - raise ArgumentError, "#{name} must be non-negative" if value.negative? - else - raise ArgumentError, "#{name} must be an Integer or Float" - end + raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer) + raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0) + raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative? end # @rbs (Array[Integer]) -> void def validate_ngram_range!(range) - valid_structure = range.is_a?(Array) && range.size == 2 - raise ArgumentError, 'ngram_range must be an array of two integers' unless valid_structure - - valid_values = range.all? { |v| v.is_a?(Integer) && v.positive? } - raise ArgumentError, 'ngram_range values must be positive integers' unless valid_values - + raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2 + raise ArgumentError, 'ngram_range values must be positive integers' unless range.all? do |v| + v.is_a?(Integer) && v.positive? 
+ end raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1] end From 748a20943538ac81acc9842887298e1e1de15bca Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:27:22 -0800 Subject: [PATCH 3/7] fix(tfidf): resolve lint and typecheck errors - Add type annotation to empty hash in transform method - Use JSON.generate instead of Hash#to_json for type safety - Cast multiplication result to Float for type checker - Auto-fix Minitest assertion style in tests --- lib/classifier/tfidf.rb | 6 +++--- test/tfidf/tfidf_test.rb | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index e90c1df..7e12752 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -87,13 +87,13 @@ def transform(document) raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted terms = extract_terms(document) - result = {} + result = {} #: Hash[Symbol, Float] terms.each do |term, tf| next unless @vocabulary.key?(term) tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f - result[term] = tf_value * @idf[term] + result[term] = (tf_value * @idf[term]).to_f end normalize_vector(result) @@ -135,7 +135,7 @@ def as_json(_options = nil) # @rbs (?untyped) -> String def to_json(_options = nil) - as_json.to_json + JSON.generate(as_json) end # Loads a vectorizer from JSON. 
diff --git a/test/tfidf/tfidf_test.rb b/test/tfidf/tfidf_test.rb index d2a2bbb..fde00e2 100644 --- a/test/tfidf/tfidf_test.rb +++ b/test/tfidf/tfidf_test.rb @@ -28,6 +28,7 @@ def test_custom_min_df_integer # Terms appearing in only 1 document should be excluded tfidf.vocabulary.each_key do |term| doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :>=, 2, "Term #{term} should appear in at least 2 documents" end end @@ -41,6 +42,7 @@ def test_custom_min_df_float min_count = (@corpus.size * 0.5).ceil tfidf.vocabulary.each_key do |term| doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :>=, min_count end end @@ -53,6 +55,7 @@ def test_custom_max_df_integer # Terms appearing in more than 2 documents should be excluded tfidf.vocabulary.each_key do |term| doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :<=, 2 end end @@ -66,6 +69,7 @@ def test_custom_max_df_float max_count = (@corpus.size * 0.5).floor tfidf.vocabulary.each_key do |term| doc_count = @corpus.count { |doc| doc.clean_word_hash.key?(term) } + assert_operator doc_count, :<=, max_count end end @@ -179,6 +183,7 @@ def test_transform_normalizes_vector # L2 norm should be 1 (or close to it due to floating point) magnitude = Math.sqrt(vector.values.sum { |v| v * v }) + assert_in_delta 1.0, magnitude, 0.0001 end @@ -247,6 +252,7 @@ def test_bigrams # Should have bigrams in vocabulary bigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.include?('_') } + refute_empty bigram_terms, 'Should have bigram terms' end @@ -257,7 +263,7 @@ def test_bigrams_only # Should only have bigrams (terms with underscore) tfidf.vocabulary.each_key do |term| - assert term.to_s.include?('_'), "Term #{term} should be a bigram" + assert_includes term.to_s, '_', "Term #{term} should be a bigram" end end @@ -267,6 +273,7 @@ def test_trigrams tfidf.fit(['quick brown fox jumps', 'lazy brown dog runs']) 
trigram_terms = tfidf.vocabulary.keys.select { |t| t.to_s.count('_') == 2 } + refute_empty trigram_terms, 'Should have trigram terms' end @@ -325,6 +332,7 @@ def test_from_json_string # Transform should produce same results original_vec = tfidf.transform('Dogs are great') loaded_vec = loaded.transform('Dogs are great') + assert_equal original_vec, loaded_vec end @@ -352,7 +360,7 @@ def test_marshal_dump_load tfidf.fit(@corpus) dumped = Marshal.dump(tfidf) - loaded = Marshal.load(dumped) # rubocop:disable Security/MarshalLoad + loaded = Marshal.load(dumped) assert_predicate loaded, :fitted? assert_equal tfidf.vocabulary, loaded.vocabulary @@ -361,6 +369,7 @@ def test_marshal_dump_load # Transform should produce same results original_vec = tfidf.transform('Dogs are great') loaded_vec = loaded.transform('Dogs are great') + assert_equal original_vec, loaded_vec end From 71d086ca2f755ff047fadc0c9b106985ee62a9cb Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:28:13 -0800 Subject: [PATCH 4/7] refactor(tfidf): use early return in extract_terms --- lib/classifier/tfidf.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index 7e12752..f3329c5 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -180,13 +180,13 @@ def extract_terms(document) word_hash.each { |term, count| result[term] += count } end - if @ngram_range[1] > 1 - tokens = tokenize_for_ngrams(document) - (2..@ngram_range[1]).each do |n| - next if n < @ngram_range[0] + return result if @ngram_range[1] <= 1 - generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 } - end + tokens = tokenize_for_ngrams(document) + (2..@ngram_range[1]).each do |n| + next if n < @ngram_range[0] + + generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 } end result From 7c05edf67f13dbef3c96ff3883a1d7207c10a35e Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:29:13 -0800 Subject: 
[PATCH 5/7] refactor(tfidf): simplify ngram validation with all?(Integer) --- lib/classifier/tfidf.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index f3329c5..c032c28 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -237,9 +237,7 @@ def validate_df!(value, name) # @rbs (Array[Integer]) -> void def validate_ngram_range!(range) raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2 - raise ArgumentError, 'ngram_range values must be positive integers' unless range.all? do |v| - v.is_a?(Integer) && v.positive? - end + raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?) raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1] end From 38a65f8f3a189905394a30b9416bc525b3700b87 Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:30:34 -0800 Subject: [PATCH 6/7] refactor(tfidf): inline single-use variables in within_df_bounds? --- lib/classifier/tfidf.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index c032c28..425cbc7 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -211,10 +211,10 @@ def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName # @rbs (Integer, Integer) -> bool def within_df_bounds?(doc_freq, num_docs) - min_count = @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df - max_count = @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df - - doc_freq.between?(min_count, max_count) + doc_freq.between?( + @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df, + @max_df.is_a?(Float) ? 
(@max_df * num_docs).floor : @max_df + ) end # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float] From 47c8f5067cca3eb8a979ffd0fc6ba56941f32c0b Mon Sep 17 00:00:00 2001 From: Lucas Carlson Date: Sun, 28 Dec 2025 19:37:47 -0800 Subject: [PATCH 7/7] fix(tfidf): fix line length in validate_ngram_range! --- lib/classifier/tfidf.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/classifier/tfidf.rb b/lib/classifier/tfidf.rb index 425cbc7..f698476 100644 --- a/lib/classifier/tfidf.rb +++ b/lib/classifier/tfidf.rb @@ -237,7 +237,9 @@ def validate_df!(value, name) # @rbs (Array[Integer]) -> void def validate_ngram_range!(range) raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2 - raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?) + unless range.all?(Integer) && range.all?(&:positive?) + raise ArgumentError, 'ngram_range values must be positive integers' + end raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1] end