From 5517d2d8a3e0955f7969639d6e14c84f90ec3909 Mon Sep 17 00:00:00 2001 From: Marco Mastrodonato Date: Wed, 18 Jun 2025 11:36:56 +0200 Subject: [PATCH] Replace C extension with Rust using magnus --- Gemfile | 4 +- Gemfile.lock | 10 ++ ext/rag_embeddings/Cargo.toml | 11 ++ ext/rag_embeddings/embedding.c | 220 -------------------------------- ext/rag_embeddings/embedding.rs | 112 ++++++++++++++++ ext/rag_embeddings/extconf.rb | 4 +- rag_embeddings.gemspec | 5 +- 7 files changed, 143 insertions(+), 223 deletions(-) create mode 100644 ext/rag_embeddings/Cargo.toml delete mode 100644 ext/rag_embeddings/embedding.c create mode 100644 ext/rag_embeddings/embedding.rs diff --git a/Gemfile b/Gemfile index c8718b9..84c9197 100644 --- a/Gemfile +++ b/Gemfile @@ -12,4 +12,6 @@ gem "rubocop" gem "faraday" gem "rspec" gem "dotenv", require: false -gem "debug" \ No newline at end of file +gem "debug" +gem "rb_sys", "~> 0.9" +gem "rake-compiler", "~> 1.2" diff --git a/Gemfile.lock b/Gemfile.lock index 96a01ea..79d26c4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -4,6 +4,7 @@ PATH rag_embeddings (0.2.2) faraday langchainrb + rb_sys (~> 0.9) sqlite3 GEM @@ -66,6 +67,11 @@ GEM racc (1.8.1) rainbow (3.1.1) rake (13.3.0) + rake-compiler (1.3.0) + rake + rake-compiler-dock (1.9.1) + rb_sys (0.9.116) + rake-compiler-dock (= 1.9.1) rdoc (6.14.0) erb psych (>= 4.0.0) @@ -101,6 +107,7 @@ GEM prism (~> 1.4) ruby-progressbar (1.13.0) sqlite3 (2.6.0-arm64-darwin) + sqlite3 (2.6.0-x86_64-linux-gnu) stringio (3.1.7) unicode-display_width (3.1.4) unicode-emoji (~> 4.0, >= 4.0.4) @@ -110,6 +117,7 @@ GEM PLATFORMS arm64-darwin + x86_64-linux DEPENDENCIES debug @@ -118,6 +126,8 @@ DEPENDENCIES langchainrb rag_embeddings! 
rake + rake-compiler (~> 1.2) + rb_sys (~> 0.9) rspec rubocop sqlite3 diff --git a/ext/rag_embeddings/Cargo.toml b/ext/rag_embeddings/Cargo.toml new file mode 100644 index 0000000..5d5d99e --- /dev/null +++ b/ext/rag_embeddings/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "embedding" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] +path = "embedding.rs" + +[dependencies] +magnus = "0.7" diff --git a/ext/rag_embeddings/embedding.c b/ext/rag_embeddings/embedding.c deleted file mode 100644 index dc683a5..0000000 --- a/ext/rag_embeddings/embedding.c +++ /dev/null @@ -1,220 +0,0 @@ -#include // Ruby API -#include // For integer types like uint16_t -#include // For memory allocation functions -#include // For math functions like sqrt - -// Main data structure for storing embeddings -// Flexible array member (values[]) allows variable length arrays -typedef struct { - uint16_t dim; // Dimension of the embedding vector - float values[]; // Flexible array member to store the actual values -} embedding_t; - -// Callback for freeing memory when Ruby's GC collects our object -static void embedding_free(void *ptr) { - if (ptr) { - xfree(ptr); // Ruby's memory free function (with null check) - } -} - -// Callback to report memory usage to Ruby's GC -static size_t embedding_memsize(const void *ptr) { - const embedding_t *emb = (const embedding_t *)ptr; - return emb ? 
sizeof(embedding_t) + emb->dim * sizeof(float) : 0; -} - -// Type information for Ruby's GC: -// Tells Ruby how to manage our C data structure -static const rb_data_type_t embedding_type = { - "RagEmbeddings/Embedding", // Type name - {0, embedding_free, embedding_memsize,}, // Functions: mark, free, size - 0, 0, // Parent type, data - RUBY_TYPED_FREE_IMMEDIATELY // Flags for immediate cleanup -}; - -// Class method: RagEmbeddings::Embedding.from_array([1.0, 2.0, ...]) -// Creates a new embedding from a Ruby array -static VALUE embedding_from_array(VALUE klass, VALUE rb_array) { - Check_Type(rb_array, T_ARRAY); // Ensure argument is a Ruby array - - long array_len = RARRAY_LEN(rb_array); - - // Validate array length fits in uint16_t (max 65535 dimensions) - if (array_len > UINT16_MAX) { - rb_raise(rb_eArgError, "Array too large: maximum %d dimensions allowed", UINT16_MAX); - } - - // Prevent zero-length embeddings - if (array_len == 0) { - rb_raise(rb_eArgError, "Cannot create embedding from empty array"); - } - - uint16_t dim = (uint16_t)array_len; - - // Allocate memory for struct + array of floats - embedding_t *ptr = xmalloc(sizeof(embedding_t) + dim * sizeof(float)); - ptr->dim = dim; - - // Copy values from Ruby array to our C array - // Using RARRAY_CONST_PTR for better performance when available - const VALUE *array_ptr = RARRAY_CONST_PTR(rb_array); - for (uint16_t i = 0; i < dim; ++i) { - VALUE val = array_ptr[i]; - - // Ensure the value is numeric - if (!RB_FLOAT_TYPE_P(val) && !RB_INTEGER_TYPE_P(val)) { - xfree(ptr); // Clean up allocated memory before raising exception - rb_raise(rb_eTypeError, "Array element at index %d is not numeric", i); - } - - ptr->values[i] = (float)NUM2DBL(val); - } - - // Wrap our C struct in a Ruby object - VALUE obj = TypedData_Wrap_Struct(klass, &embedding_type, ptr); - return obj; -} - -// Instance method: embedding.dim -// Returns the dimension of the embedding -static VALUE embedding_dim(VALUE self) { - embedding_t *ptr; 
- // Get the C struct from the Ruby object - TypedData_Get_Struct(self, embedding_t, &embedding_type, ptr); - return INT2NUM(ptr->dim); -} - -// Instance method: embedding.to_a -// Converts the embedding back to a Ruby array -static VALUE embedding_to_a(VALUE self) { - embedding_t *ptr; - TypedData_Get_Struct(self, embedding_t, &embedding_type, ptr); - - // Create a new Ruby array with pre-allocated capacity - VALUE arr = rb_ary_new_capa(ptr->dim); - - // Copy each float value to the Ruby array - // Using rb_ary_store for better performance than rb_ary_push - for (uint16_t i = 0; i < ptr->dim; ++i) { - rb_ary_store(arr, i, DBL2NUM(ptr->values[i])); - } - - return arr; -} - -// Instance method: embedding.cosine_similarity(other_embedding) -// Calculate cosine similarity between two embeddings using optimized algorithm -static VALUE embedding_cosine_similarity(VALUE self, VALUE other) { - embedding_t *a, *b; - // Get C structs for both embeddings - TypedData_Get_Struct(self, embedding_t, &embedding_type, a); - TypedData_Get_Struct(other, embedding_t, &embedding_type, b); - - // Ensure dimensions match - if (a->dim != b->dim) { - rb_raise(rb_eArgError, "Dimension mismatch: %d vs %d", a->dim, b->dim); - } - - // Use double precision for intermediate calculations to reduce accumulation errors - double dot = 0.0, norm_a = 0.0, norm_b = 0.0; - - // Calculate dot product and vector magnitudes in a single loop - // This is more cache-friendly than separate loops - const float *va = a->values; - const float *vb = b->values; - - for (uint16_t i = 0; i < a->dim; ++i) { - float ai = va[i]; - float bi = vb[i]; - - dot += (double)ai * bi; // Dot product - norm_a += (double)ai * ai; // Square of magnitude for vector a - norm_b += (double)bi * bi; // Square of magnitude for vector b - } - - // Check for zero vectors to avoid division by zero - if (norm_a == 0.0 || norm_b == 0.0) { - return DBL2NUM(0.0); // Return 0 similarity for zero vectors - } - - // Apply cosine similarity 
formula: dot(a,b)/(|a|*|b|) - // Using sqrt for better numerical stability - double magnitude_product = sqrt(norm_a * norm_b); - double similarity = dot / magnitude_product; - - // Clamp result to [-1, 1] to handle floating point precision errors - if (similarity > 1.0) similarity = 1.0; - if (similarity < -1.0) similarity = -1.0; - - return DBL2NUM(similarity); -} - -// Instance method: embedding.magnitude -// Calculate the magnitude (L2 norm) of the embedding vector -static VALUE embedding_magnitude(VALUE self) { - embedding_t *ptr; - TypedData_Get_Struct(self, embedding_t, &embedding_type, ptr); - - double sum_squares = 0.0; - const float *values = ptr->values; - - for (uint16_t i = 0; i < ptr->dim; ++i) { - float val = values[i]; - sum_squares += (double)val * val; - } - - return DBL2NUM(sqrt(sum_squares)); -} - -// Instance method: embedding.normalize! -// Normalize the embedding vector in-place (destructive operation) -static VALUE embedding_normalize_bang(VALUE self) { - embedding_t *ptr; - TypedData_Get_Struct(self, embedding_t, &embedding_type, ptr); - - // Calculate magnitude - double sum_squares = 0.0; - float *values = ptr->values; - - for (uint16_t i = 0; i < ptr->dim; ++i) { - float val = values[i]; - sum_squares += (double)val * val; - } - - double magnitude = sqrt(sum_squares); - - // Avoid division by zero - if (magnitude == 0.0) { - rb_raise(rb_eZeroDivError, "Cannot normalize zero vector"); - } - - // Normalize each component - float inv_magnitude = (float)(1.0 / magnitude); - for (uint16_t i = 0; i < ptr->dim; ++i) { - values[i] *= inv_magnitude; - } - - return self; // Return self for method chaining -} - -// Ruby extension initialization function -// This function is called when the extension is loaded -void Init_embedding(void) { - // Define module and class - VALUE mRag = rb_define_module("RagEmbeddings"); - VALUE cEmbedding = rb_define_class_under(mRag, "Embedding", rb_cObject); - - // IMPORTANT: Undefine the default allocator to prevent 
the warning
- // This is necessary when using TypedData_Wrap_Struct
- rb_undef_alloc_func(cEmbedding);
-
- // Register class methods
- rb_define_singleton_method(cEmbedding, "from_array", embedding_from_array, 1);
-
- // Register instance methods
- rb_define_method(cEmbedding, "dim", embedding_dim, 0);
- rb_define_method(cEmbedding, "to_a", embedding_to_a, 0);
- rb_define_method(cEmbedding, "cosine_similarity", embedding_cosine_similarity, 1);
- rb_define_method(cEmbedding, "magnitude", embedding_magnitude, 0);
- rb_define_method(cEmbedding, "normalize!", embedding_normalize_bang, 0);
-}
\ No newline at end of file
diff --git a/ext/rag_embeddings/embedding.rs b/ext/rag_embeddings/embedding.rs
new file mode 100644
index 0000000..a8a081e
--- /dev/null
+++ b/ext/rag_embeddings/embedding.rs
@@ -0,0 +1,140 @@
+use magnus::{function, method, prelude::*, typed_data::Obj, Error, Ruby, DataTypeFunctions, TypedData};
+use std::cell::RefCell;
+
+/// Embedding vector exposed to Ruby as `RagEmbeddings::Embedding`.
+///
+/// The components sit in a `RefCell` so `normalize!` can rewrite them in
+/// place without needing `&mut self`.
+#[derive(TypedData)]
+#[magnus(class = "RagEmbeddings::Embedding", free_immediately, size)]
+struct Embedding {
+    values: RefCell<Vec<f32>>,
+}
+
+impl DataTypeFunctions for Embedding {
+    /// Struct size plus the vector's allocated capacity, in bytes.
+    fn size(&self) -> usize {
+        std::mem::size_of::<Self>() + self.values.borrow().capacity() * std::mem::size_of::<f32>()
+    }
+}
+
+impl Embedding {
+    /// `Embedding.from_array(arr)`: wraps `arr` in a new embedding.
+    ///
+    /// Fails with `ArgumentError` when `arr` is empty or has more than
+    /// `u16::MAX` elements.
+    fn from_array(arr: Vec<f32>) -> Result<Self, Error> {
+        if arr.is_empty() {
+            return Err(Error::new(
+                magnus::exception::arg_error(),
+                "Cannot create embedding from empty array",
+            ));
+        }
+        if arr.len() > u16::MAX as usize {
+            return Err(Error::new(
+                magnus::exception::arg_error(),
+                format!(
+                    "Array too large: maximum {} dimensions allowed",
+                    u16::MAX
+                ),
+            ));
+        }
+        Ok(Self {
+            values: RefCell::new(arr),
+        })
+    }
+
+    /// `embedding.dim`: number of components.
+    fn dim(&self) -> usize {
+        self.values.borrow().len()
+    }
+
+    /// `embedding.to_a`: a copy of the components.
+    fn to_a(&self) -> Vec<f32> {
+        self.values.borrow().clone()
+    }
+
+    /// `embedding.cosine_similarity(other)`: dot(a, b) / (|a| * |b|),
+    /// accumulated in `f64` and clamped to [-1, 1].
+    ///
+    /// Yields 0.0 when either vector has zero norm; fails with
+    /// `ArgumentError` when the dimensions differ.
+    fn cosine_similarity(&self, other: &Embedding) -> Result<f64, Error> {
+        let a = self.values.borrow();
+        let b = other.values.borrow();
+        if a.len() != b.len() {
+            return Err(Error::new(
+                magnus::exception::arg_error(),
+                format!("Dimension mismatch: {} vs {}", a.len(), b.len()),
+            ));
+        }
+        let mut dot = 0.0f64;
+        let mut norm_a = 0.0f64;
+        let mut norm_b = 0.0f64;
+        for (ai, bi) in a.iter().zip(b.iter()) {
+            dot += *ai as f64 * *bi as f64;
+            norm_a += (*ai as f64) * (*ai as f64);
+            norm_b += (*bi as f64) * (*bi as f64);
+        }
+        if norm_a == 0.0 || norm_b == 0.0 {
+            return Ok(0.0);
+        }
+        let sim = dot / (norm_a * norm_b).sqrt();
+        Ok(sim.clamp(-1.0, 1.0))
+    }
+
+    /// `embedding.magnitude`: L2 norm, accumulated in `f64`.
+    fn magnitude(&self) -> f64 {
+        let a = self.values.borrow();
+        let mut sum = 0.0f64;
+        for v in a.iter() {
+            sum += (*v as f64) * (*v as f64);
+        }
+        sum.sqrt()
+    }
+
+    /// `embedding.normalize!`: scales the components in place to unit length.
+    ///
+    /// The receiver is taken as `Obj<Self>` so the same Ruby object can be
+    /// returned for chaining (`emb.normalize!.to_a`), matching the C
+    /// extension this file replaces. Fails with `ZeroDivisionError` for a
+    /// zero vector.
+    fn normalize_bang(rb_self: Obj<Self>) -> Result<Obj<Self>, Error> {
+        {
+            let mut values = rb_self.values.borrow_mut();
+            let mut sum = 0.0f64;
+            for v in values.iter() {
+                sum += (*v as f64) * (*v as f64);
+            }
+            let magnitude = sum.sqrt();
+            if magnitude == 0.0 {
+                return Err(Error::new(
+                    magnus::exception::zero_div_error(),
+                    "Cannot normalize zero vector",
+                ));
+            }
+            // Divide in f64, then narrow; `1.0 / magnitude as f32` casts the
+            // magnitude to f32 before dividing.
+            let inv_mag = (1.0 / magnitude) as f32;
+            for v in values.iter_mut() {
+                *v *= inv_mag;
+            }
+        } // release the mutable borrow before returning the object
+        Ok(rb_self)
+    }
+}
+
+/// Extension entry point: defines `RagEmbeddings::Embedding` and its methods.
+#[magnus::init]
+fn init(ruby: &Ruby) -> Result<(), Error> {
+    let m_rag = ruby.define_module("RagEmbeddings")?;
+    let class = m_rag.define_class("Embedding", ruby.class_object())?;
+    class.undef_default_alloc_func();
+    class.define_singleton_method("from_array", function!(Embedding::from_array, 1))?;
+    class.define_method("dim", method!(Embedding::dim, 0))?;
+    class.define_method("to_a", method!(Embedding::to_a, 0))?;
+    class.define_method("cosine_similarity", method!(Embedding::cosine_similarity, 1))?;
+    class.define_method("magnitude", method!(Embedding::magnitude, 0))?;
+    class.define_method("normalize!", method!(Embedding::normalize_bang, 0))?;
+    Ok(())
+}
diff --git a/ext/rag_embeddings/extconf.rb b/ext/rag_embeddings/extconf.rb
index 617416e..c8e9114 100644
--- a/ext/rag_embeddings/extconf.rb
+++ b/ext/rag_embeddings/extconf.rb
@@ -1,2 +1,4 @@
 require "mkmf"
-create_makefile("rag_embeddings/embedding")
\
No newline at end of file +require "rb_sys/mkmf" + +create_rust_makefile("rag_embeddings/embedding") diff --git a/rag_embeddings.gemspec b/rag_embeddings.gemspec index 1db5139..0f28afa 100644 --- a/rag_embeddings.gemspec +++ b/rag_embeddings.gemspec @@ -11,7 +11,7 @@ Gem::Specification.new do |spec| spec.homepage = "https://rubygems.org/gems/rag_embeddings" spec.license = "MIT" - spec.files = Dir["README.md", "LICENSE", "lib/**/*.rb", "ext/**/*.{c,rb}", "Rakefile"] + spec.files = Dir["README.md", "LICENSE", "lib/**/*.rb", "ext/**/*.{rs,rb,toml}", "Rakefile"] spec.extensions = ["ext/rag_embeddings/extconf.rb"] spec.require_paths = ["lib", "ext"] @@ -21,6 +21,9 @@ Gem::Specification.new do |spec| spec.add_runtime_dependency "sqlite3" spec.add_runtime_dependency "langchainrb" spec.add_runtime_dependency "faraday" + spec.add_runtime_dependency "rb_sys", "~> 0.9" + + spec.add_development_dependency "rake-compiler", "~> 1.2" spec.add_development_dependency "rake" spec.add_development_dependency "rspec"