From 7d42be8371f7f0d2c7729ccdd4daf2c144240f50 Mon Sep 17 00:00:00 2001
From: rlaope
Date: Sat, 31 Jan 2026 16:58:27 +0900
Subject: [PATCH 1/3] feat :: tokenizer, airml options, embed

---
 crates/airml-core/src/engine.rs    |  27 ++++
 crates/airml-preprocess/src/lib.rs |   2 +-
 src/cli.rs                         |  37 ++++++
 src/commands/embed.rs              | 193 +++++++++++++++++++++++++++++
 src/commands/mod.rs                |   6 +
 src/main.rs                        |   2 +
 6 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 src/commands/embed.rs

diff --git a/crates/airml-core/src/engine.rs b/crates/airml-core/src/engine.rs
index 43524cf..774464e 100644
--- a/crates/airml-core/src/engine.rs
+++ b/crates/airml-core/src/engine.rs
@@ -192,6 +192,33 @@ impl InferenceEngine {
         self.run_named(vec![(&input_name, input)])
     }
 
+    /// Run inference with multiple input tensors (matched by order)
+    pub fn run_multiple(&mut self, inputs: Vec<ArrayD<f32>>) -> Result<Vec<ArrayD<f32>>> {
+        if inputs.len() != self.metadata.inputs.len() {
+            return Err(AirMLError::ConfigError(format!(
+                "Expected {} inputs, got {}",
+                self.metadata.inputs.len(),
+                inputs.len()
+            )));
+        }
+
+        // Collect input names first to avoid borrow conflict
+        let input_names: Vec<String> = self
+            .metadata
+            .inputs
+            .iter()
+            .map(|info| info.name.clone())
+            .collect();
+
+        let named_inputs: Vec<(&str, ArrayD<f32>)> = input_names
+            .iter()
+            .zip(inputs.into_iter())
+            .map(|(name, arr)| (name.as_str(), arr))
+            .collect();
+
+        self.run_named(named_inputs)
+    }
+
     /// Run inference with named inputs
     pub fn run_named(&mut self, inputs: Vec<(&str, ArrayD<f32>)>) -> Result<Vec<ArrayD<f32>>> {
         // Create input tensors
diff --git a/crates/airml-preprocess/src/lib.rs b/crates/airml-preprocess/src/lib.rs
index 231bd4d..ef678ac 100644
--- a/crates/airml-preprocess/src/lib.rs
+++ b/crates/airml-preprocess/src/lib.rs
@@ -10,6 +10,6 @@ mod text;
 pub use image::{ImagePreprocessor, ResizeMode};
 
 #[cfg(feature = "nlp")]
-pub use text::TextPreprocessor;
+pub use text::{TextPreprocessor, TextPreprocessError, TokenizedInput};
 
 pub use ndarray;
diff --git a/src/cli.rs b/src/cli.rs
index 65dc30c..05c9ea0 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -34,6 +34,10 @@ pub enum Commands {
 
     /// Display system information
     System,
+
+    /// Generate text embeddings
+    #[cfg(feature = "nlp")]
+    Embed(EmbedArgs),
 }
 
 /// Arguments for the `run` command
@@ -103,3 +107,36 @@ pub struct BenchArgs {
     #[arg(long)]
     pub shape: Option<String>,
 }
+
+/// Arguments for the `embed` command
+#[cfg(feature = "nlp")]
+#[derive(Parser, Debug)]
+pub struct EmbedArgs {
+    /// Path to the ONNX embedding model file
+    #[arg(short, long)]
+    pub model: PathBuf,
+
+    /// Path to the tokenizer.json file
+    #[arg(short, long)]
+    pub tokenizer: PathBuf,
+
+    /// Text to embed
+    #[arg(long)]
+    pub text: String,
+
+    /// Maximum sequence length
+    #[arg(long, default_value = "512")]
+    pub max_length: usize,
+
+    /// Execution provider to use (cpu, coreml, neural-engine)
+    #[arg(short, long, default_value = "auto")]
+    pub provider: String,
+
+    /// Output format (json, raw)
+    #[arg(long, default_value = "json")]
+    pub output: String,
+
+    /// Normalize output embeddings (L2 normalization)
+    #[arg(long)]
+    pub normalize: bool,
+}
diff --git a/src/commands/embed.rs b/src/commands/embed.rs
new file mode 100644
index 0000000..130efcb
--- /dev/null
+++ b/src/commands/embed.rs
@@ -0,0 +1,193 @@
+//! Embed command implementation
+//!
+//! Generates text embeddings using ONNX models.
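+//!
+//! A minimal invocation sketch (the model and tokenizer paths are
+//! placeholders for a sentence-embedding model exported to ONNX together
+//! with its `tokenizer.json`):
+//!
+//! ```text
+//! airml embed -m model.onnx -t tokenizer.json --text "Hello world" --normalize
+//! ```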
+
+use airml_core::{ndarray::ArrayD, InferenceEngine, SessionConfig};
+use airml_preprocess::TextPreprocessor;
+use airml_providers::auto_select_providers;
+use anyhow::{Context, Result};
+
+use crate::cli::EmbedArgs;
+
+/// Execute the embed command
+pub fn execute(args: &EmbedArgs, verbose: bool) -> Result<()> {
+    if verbose {
+        println!("Loading tokenizer: {}", args.tokenizer.display());
+    }
+
+    // Load tokenizer
+    let preprocessor = TextPreprocessor::from_file(&args.tokenizer)
+        .map_err(|e| anyhow::anyhow!("Failed to load tokenizer: {}", e))?
+        .with_max_length(args.max_length);
+
+    if verbose {
+        println!("Loading model: {}", args.model.display());
+    }
+
+    // Configure session with providers
+    let providers = select_providers(&args.provider)?;
+    let config = SessionConfig::new().with_providers(providers);
+
+    // Load model
+    let mut engine = InferenceEngine::from_file_with_config(&args.model, config)
+        .context("Failed to load model")?;
+
+    if verbose {
+        println!("Model inputs: {:?}", engine.inputs());
+        println!("Model outputs: {:?}", engine.outputs());
+    }
+
+    // Tokenize text
+    if verbose {
+        println!("Tokenizing text...");
+    }
+
+    let tokenized = preprocessor
+        .encode(&args.text)
+        .map_err(|e| anyhow::anyhow!("Failed to tokenize: {}", e))?;
+
+    let (input_ids, attention_mask) = tokenized.to_array();
+
+    if verbose {
+        println!("Token count: {}", tokenized.input_ids.len());
+    }
+
+    // Run inference
+    if verbose {
+        println!("Running inference...");
+    }
+
+    // Most embedding models expect input_ids and attention_mask;
+    // dispatch on the number of inputs the model declares
+    let num_inputs = engine.inputs().len();
+
+    let outputs = if num_inputs >= 2 {
+        // Model expects multiple inputs (input_ids, attention_mask)
+        engine
+            .run_multiple(vec![
+                input_ids.into_dyn().mapv(|x| x as f32),
+                attention_mask.into_dyn().mapv(|x| x as f32),
+            ])
+            .context("Inference failed")?
+    } else {
+        // Model expects a single input
+        engine
+            .run(input_ids.into_dyn().mapv(|x| x as f32))
+            .context("Inference failed")?
+    };
+
+    // Get embeddings from output
+    let embeddings = extract_embeddings(&outputs)?;
+
+    // Optionally normalize
+    let embeddings = if args.normalize {
+        l2_normalize(&embeddings)
+    } else {
+        embeddings
+    };
+
+    // Output results
+    match args.output.as_str() {
+        "json" => print_json(&embeddings, &args.text),
+        "raw" => print_raw(&embeddings),
+        _ => print_json(&embeddings, &args.text),
+    }
+
+    Ok(())
+}
+
+fn select_providers(provider_name: &str) -> Result<Vec<airml_providers::ExecutionProviderDispatch>> {
+    match provider_name {
+        "auto" => Ok(auto_select_providers()),
+        "cpu" => Ok(vec![airml_providers::CpuProvider::default().into_dispatch()]),
+        #[cfg(feature = "coreml")]
+        "coreml" => Ok(vec![airml_providers::CoreMLProvider::default().into_dispatch()]),
+        #[cfg(feature = "coreml")]
+        "neural-engine" => Ok(vec![
+            airml_providers::CoreMLProvider::default()
+                .neural_engine_only()
+                .into_dispatch(),
+        ]),
+        _ => {
+            println!("Warning: Unknown provider '{}', using auto-selection", provider_name);
+            Ok(auto_select_providers())
+        }
+    }
+}
+
+fn extract_embeddings(outputs: &[ArrayD<f32>]) -> Result<Vec<f32>> {
+    let output = outputs.first().context("No output from model")?;
+
+    // Handle different output shapes:
+    // - [batch, seq_len, hidden] -> take [CLS] token or mean pooling
+    // - [batch, hidden] -> direct embedding
+    let shape = output.shape();
+
+    let embeddings: Vec<f32> = match shape.len() {
+        2 => {
+            // [batch, hidden] - direct embedding
+            output.iter().copied().collect()
+        }
+        3 => {
+            // [batch, seq_len, hidden] - use mean pooling
+            let hidden_size = shape[2];
+            let seq_len = shape[1];
+
+            // Mean pooling across sequence dimension
+            let mut pooled = vec![0.0f32; hidden_size];
+            for i in 0..seq_len {
+                for j in 0..hidden_size {
+                    pooled[j] += output[[0, i, j]];
+                }
+            }
+            for v in &mut pooled {
+                *v /= seq_len as f32;
+            }
+            pooled
+        }
+        _ => {
+            // Flatten whatever we get
+            output.iter().copied().collect()
+        }
+    };
+
+    Ok(embeddings)
+}
+
+fn l2_normalize(vec: &[f32]) -> Vec<f32> {
+    let norm: f32 = vec.iter().map(|x| x * x).sum::<f32>().sqrt();
+    if norm > 0.0 {
+        vec.iter().map(|x| x / norm).collect()
+    } else {
+        vec.to_vec()
+    }
+}
+
+fn print_json(embeddings: &[f32], text: &str) {
+    println!("{{");
+    println!("  \"text\": {:?},", text);
+    println!("  \"dimension\": {},", embeddings.len());
+    println!("  \"embedding\": [");
+
+    let chunk_size = 8;
+    for (i, chunk) in embeddings.chunks(chunk_size).enumerate() {
+        let values: Vec<String> = chunk.iter().map(|v| format!("{:.6}", v)).collect();
+        let is_last = (i + 1) * chunk_size >= embeddings.len();
+        println!(
+            "    {}{}",
+            values.join(", "),
+            if is_last { "" } else { "," }
+        );
+    }
+
+    println!("  ]");
+    println!("}}");
+}
+
+fn print_raw(embeddings: &[f32]) {
+    for v in embeddings {
+        println!("{:.6}", v);
+    }
+}
diff --git a/src/commands/mod.rs b/src/commands/mod.rs
index e7025cc..b5562aa 100644
--- a/src/commands/mod.rs
+++ b/src/commands/mod.rs
@@ -7,7 +7,13 @@ pub mod info;
 pub mod run;
 pub mod system;
 
+#[cfg(feature = "nlp")]
+pub mod embed;
+
 pub use bench::execute as bench;
 pub use info::execute as info;
 pub use run::execute as run;
 pub use system::execute as system;
+
+#[cfg(feature = "nlp")]
+pub use embed::execute as embed;
diff --git a/src/main.rs b/src/main.rs
index 69b541a..dc5468c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -25,5 +25,7 @@ fn main() -> Result<()> {
         Commands::Info(args) => commands::info(args),
         Commands::Bench(args) => commands::bench(args),
         Commands::System => commands::system(),
+        #[cfg(feature = "nlp")]
+        Commands::Embed(args) => commands::embed(args, cli.verbose),
     }
 }

From dd2d70c71ad58ff8f1f75695628ce0a08f23352c Mon Sep 17 00:00:00 2001
From: rlaope
Date: Sat, 31 Jan 2026 17:06:31 +0900
Subject: [PATCH 2/3] docs :: readme, guide air ml

---
 README.md            | 192 +++++++++++-----
 docs/API.md          | 505 +++++++++++++++++++++++++++++++++++++++++++
 docs/ARCHITECTURE.md | 252 +++++++++++++++++++++
 docs/TUTORIAL.md     | 380 ++++++++++++++++++++++++++++++++
 4 files changed, 1277 insertions(+), 52 deletions(-)
 create mode 100644 docs/API.md
 create mode 100644 docs/ARCHITECTURE.md
 create mode 100644 docs/TUTORIAL.md

diff --git a/README.md b/README.md
index 87e6f04..c12b0b1 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,10 @@ A lightweight ML runtime that runs ONNX models without Python. Fast, portable, a
 
 - **Single Binary**: Deploy ML models with a single ~50MB binary
 - **Fast Cold Start**: 0.01-0.05s startup time (100x faster than Python)
-- **Apple Silicon Acceleration**: Native Metal/CoreML support for M-series chips
+- **Apple Silicon Acceleration**: Native CoreML/Metal/Neural Engine support
 - **ONNX Support**: Run models exported from PyTorch, TensorFlow, and more
 - **Zero Dependencies**: No Python, no virtual environments, no package managers
+- **NLP Support**: Text tokenization and embedding generation
 
 ## Installation
@@ -19,9 +20,12 @@
 git clone https://github.com/airml/airml.git
 cd airml
 
-# Build release binary
+# Build release binary (CPU only)
 cargo build --release
 
+# Build with all features (macOS)
+cargo build --release --features coreml,nlp
+
 # Optional: Install to PATH
 cargo install --path .
 ```
@@ -32,25 +36,63 @@ Download from [Releases](https://github.com/airml/airml/releases).
 
 ## Quick Start
 
+### Image Classification
+
 ```bash
-# Run image classification
-airml run --model resnet50.onnx --input cat.jpg --labels imagenet_labels.txt
+# Run classification on an image
+airml run -m resnet50.onnx -i cat.jpg -l imagenet_labels.txt
+
+# Output:
+# Top 5 predictions:
+# --------------------------------------------------
+#   281  95.23% ======================================== tabby
+#   282   3.12% ===                                      tiger cat
+#   285   0.89% =                                        Egyptian cat
+```
+
+### Text Embeddings
+
+```bash
+# Generate text embeddings
+airml embed -m sentence-transformer.onnx -t tokenizer.json --text "Hello world"
+
+# Output:
+# {
+#   "text": "Hello world",
+#   "dimension": 384,
+#   "embedding": [0.123456, 0.234567, ...]
+# }
+```
 
-# Display model information
-airml info --model resnet50.onnx
+### Benchmarking
 
+```bash
 # Benchmark inference performance
-airml bench --model resnet50.onnx -n 100
+airml bench -m model.onnx -n 100 -p neural-engine
+
+# Output:
+#   Mean latency:  12.34 ms
+#   Throughput:    81.00 inferences/sec
+```
+
+### System Info
 
-# Check system capabilities
+```bash
+# Check available providers
 airml system
+
+# Output:
+#   OS: macos
+#   Architecture: aarch64
+#   Apple Silicon: true
+#   Available providers: cpu, coreml
 ```
 
 ## CLI Reference
 
 ### `airml run`
 
-Run inference on an input.
+Run inference on an input image.
 ```bash
 airml run --model <MODEL> --input <INPUT> [OPTIONS]
@@ -58,23 +100,36 @@
 
 Options:
   -m, --model       Path to ONNX model file
   -i, --input       Path to input file (image)
-  -l, --labels      Path to labels file (one label per line)
-  -k, --top-k       Number of top predictions to show [default: 5]
-  -p, --provider    Execution provider (auto, cpu, coreml) [default: auto]
-  --preprocess      Preprocessing preset (imagenet, clip, yolo, none) [default: imagenet]
+  -l, --labels      Path to labels file
+  -k, --top-k       Top predictions to show [default: 5]
+  -p, --provider    Execution provider (auto, cpu, coreml, neural-engine)
+  --preprocess      Preprocessing (imagenet, clip, yolo, none)
   --raw             Output raw tensor values
 ```
 
-### `airml info`
+### `airml embed`
 
-Display model information.
+Generate text embeddings (requires `nlp` feature).
 
 ```bash
-airml info --model <MODEL> [OPTIONS]
+airml embed --model <MODEL> --tokenizer <TOKENIZER> --text <TEXT> [OPTIONS]
 
 Options:
-  -m, --model       Path to ONNX model file
-  -v, --verbose     Show detailed information
+  -m, --model       ONNX embedding model
+  -t, --tokenizer   tokenizer.json file
+  --text            Text to embed
+  --max-length      Max sequence length [default: 512]
+  -p, --provider    Execution provider
+  --output          Output format (json, raw)
+  --normalize       L2 normalize embeddings
+```
+
+### `airml info`
+
+Display model information.
+
+```bash
+airml info --model <MODEL> [-v]
 ```
 
 ### `airml bench`
 
 Benchmark inference performance.
 
 ```bash
 airml bench --model <MODEL> [OPTIONS]
@@ -85,78 +140,111 @@
 
 Options:
-  -m, --model       Path to ONNX model file
-  -n, --iterations  Number of iterations [default: 100]
+  -n, --iterations  Iterations [default: 100]
   -w, --warmup      Warmup iterations [default: 10]
-  -p, --provider    Execution provider [default: auto]
+  -p, --provider    Execution provider
   --shape           Input shape (e.g., "1,3,224,224")
 ```
 
 ### `airml system`
 
-Display system information and available providers.
+Display system capabilities.
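+
+```bash
+airml system
+```
+
+It takes no options; the output lists the OS, CPU architecture, Apple Silicon
+detection, and available execution providers (see "System Info" above).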
+
+## Execution Providers
+
+| Provider | Platform | Hardware | Flag |
+|----------|----------|----------|------|
+| CPU | All | Any CPU | (default) |
+| CoreML | macOS | Apple Silicon | `--features coreml` |
+| Neural Engine | macOS | M1/M2/M3 ANE | `--features coreml` |
 
 ```bash
-airml system
+# Build with specific providers
+cargo build --release                        # CPU only
+cargo build --release --features coreml      # + CoreML
+cargo build --release --features nlp         # + NLP
+cargo build --release --features coreml,nlp  # All features
 ```
 
-## Execution Providers
+## Performance
 
-| Provider | Platform | Hardware |
-|----------|----------|----------|
-| CPU | All | Any CPU |
-| CoreML | macOS | Apple Silicon (M1/M2/M3) |
+Benchmarked on Apple M2 with ResNet50:
 
-Enable providers with feature flags:
+| Provider | Latency | Throughput |
+|----------|---------|------------|
+| CPU | ~50ms | ~20 inf/s |
+| CoreML (All) | ~15ms | ~65 inf/s |
+| Neural Engine | ~8ms | ~125 inf/s |
 
-```bash
-# CPU only (default)
-cargo build --release
+| Metric | airML | Python (PyTorch) |
+|--------|-------|------------------|
+| Binary Size | ~50MB | ~2GB |
+| Cold Start | 0.01-0.05s | 2-5s |
+| Memory Usage | ~100MB | ~500MB+ |
 
-# With CoreML support
-cargo build --release --features coreml
-```
+## Using as a Library
+
+```rust
+use airml_core::{InferenceEngine, SessionConfig};
+use airml_preprocess::ImagePreprocessor;
+use airml_providers::CoreMLProvider;
+
+fn main() -> anyhow::Result<()> {
+    // Configure with CoreML
+    let providers = vec![CoreMLProvider::default().neural_engine_only().into_dispatch()];
+    let config = SessionConfig::new().with_providers(providers);
+
+    // Load model
+    let mut engine = InferenceEngine::from_file_with_config("model.onnx", config)?;
+
+    // Preprocess and run
+    let input = ImagePreprocessor::imagenet().load_and_process("image.jpg")?;
+    let outputs = engine.run(input.into_dyn())?;
 
-## Embedding Models
+    Ok(())
+}
+```
 
-Embed ONNX models directly in your binary:
+## Embedding Models in Binary
 
 ```rust
 use airml_embed::EmbeddedModel;
 
-// Embed at compile time
-static MODEL_BYTES: &[u8] = include_bytes!("../models/resnet50.onnx");
+static MODEL: &[u8] = include_bytes!("model.onnx");
 
 fn main() -> anyhow::Result<()> {
-    let model = EmbeddedModel::new(MODEL_BYTES);
-    let engine = model.into_engine()?;
-
-    // Run inference...
+    let engine = EmbeddedModel::new(MODEL).into_engine()?;
+    // Use engine...
     Ok(())
 }
 ```
 
-## Performance
-
-| Metric | airML | Python (PyTorch) |
-|--------|-------|------------------|
-| Binary Size | ~50MB | ~2GB |
-| Cold Start | 0.01-0.05s | 2-5s |
-| Memory Usage | ~100MB | ~500MB+ |
-
 ## Project Structure
 
 ```
 airML/
 ├── crates/
-│   ├── airml-core/        # Inference engine
+│   ├── airml-core/        # Inference engine (ONNX Runtime wrapper)
 │   ├── airml-preprocess/  # Image/text preprocessing
 │   ├── airml-providers/   # Execution providers (CPU, CoreML)
 │   └── airml-embed/       # Model embedding utilities
 ├── src/                   # CLI binary
+│   ├── main.rs
+│   ├── cli.rs             # Argument parsing
+│   └── commands/          # Command implementations
+├── docs/                  # Documentation
+│   ├── ARCHITECTURE.md    # Internal architecture
+│   ├── TUTORIAL.md        # Step-by-step tutorials
+│   └── API.md             # API reference
 └── models/                # Test models (gitignored)
 ```
 
+## Documentation
+
+- [Architecture](docs/ARCHITECTURE.md) - Internal design and data flow
+- [Tutorial](docs/TUTORIAL.md) - Step-by-step guides
+- [API Reference](docs/API.md) - Complete API documentation
+
 ## License
 
 MIT License - see [LICENSE](LICENSE) for details.
diff --git a/docs/API.md b/docs/API.md
new file mode 100644
index 0000000..6a6120e
--- /dev/null
+++ b/docs/API.md
@@ -0,0 +1,505 @@
+# airML API Reference
+
+Complete API documentation for airML crates.
+
+## airml-core
+
+### InferenceEngine
+
+The main interface for loading and running ONNX models.
+
+```rust
+use airml_core::{InferenceEngine, SessionConfig};
+```
+
+#### Constructors
+
+```rust
+// Load from file with default config
+let engine = InferenceEngine::from_file("model.onnx")?;
+
+// Load from file with custom config
+let config = SessionConfig::new().with_intra_threads(4);
+let engine = InferenceEngine::from_file_with_config("model.onnx", config)?;
+
+// Load from bytes (for embedded models)
+let engine = InferenceEngine::from_bytes(model_bytes)?;
+
+// Load from bytes with custom config
+let engine = InferenceEngine::from_bytes_with_config(model_bytes, config)?;
+```
+
+#### Methods
+
+```rust
+// Run inference with single input
+pub fn run(&mut self, input: ArrayD<f32>) -> Result<Vec<ArrayD<f32>>>
+
+// Run inference with multiple inputs (matched by order)
+pub fn run_multiple(&mut self, inputs: Vec<ArrayD<f32>>) -> Result<Vec<ArrayD<f32>>>
+
+// Run inference with named inputs
+pub fn run_named(&mut self, inputs: Vec<(&str, ArrayD<f32>)>) -> Result<Vec<ArrayD<f32>>>
+
+// Get model metadata
+pub fn metadata(&self) -> &ModelMetadata
+
+// Get input tensor info
+pub fn inputs(&self) -> &[TensorInfo]
+
+// Get output tensor info
+pub fn outputs(&self) -> &[TensorInfo]
+```
+
+### SessionConfig
+
+Configuration for ONNX Runtime sessions.
+
+```rust
+use airml_core::SessionConfig;
+```
+
+#### Builder Methods
+
+```rust
+let config = SessionConfig::new()
+    .with_intra_threads(4)           // Threads within operators
+    .with_inter_threads(2)           // Threads between operators
+    .with_optimization_level(level)  // Graph optimization
+    .with_providers(providers);      // Execution providers
+```
+
+### ModelMetadata
+
+Information about a loaded model.
+
+```rust
+pub struct ModelMetadata {
+    pub name: Option<String>,
+    pub description: Option<String>,
+    pub version: Option<String>,
+    pub producer: Option<String>,
+    pub inputs: Vec<TensorInfo>,
+    pub outputs: Vec<TensorInfo>,
+}
+```
+
+### TensorInfo
+
+Information about model inputs/outputs.
+
+```rust
+pub struct TensorInfo {
+    pub name: String,
+    pub shape: Vec<i64>,  // -1 for dynamic dimensions
+    pub dtype: String,
+}
+```
+
+### AirMLError
+
+Error types for the core module.
+
+```rust
+pub enum AirMLError {
+    ModelNotFound(String),
+    ModelLoadError(String),
+    InferenceError(String),
+    PreprocessError(String),
+    ConfigError(String),
+    OrtError(String),
+}
+```
+
+---
+
+## airml-preprocess
+
+### ImagePreprocessor
+
+Image preprocessing for vision models.
+
+```rust
+use airml_preprocess::{ImagePreprocessor, ResizeMode};
+```
+
+#### Presets
+
+```rust
+// ImageNet preset (224x224, standard normalization)
+let preprocessor = ImagePreprocessor::imagenet();
+
+// CLIP preset (224x224, CLIP normalization)
+let preprocessor = ImagePreprocessor::clip();
+
+// YOLO preset (640x640, no normalization, letterbox)
+let preprocessor = ImagePreprocessor::yolo(640);
+
+// Custom preset
+let preprocessor = ImagePreprocessor::custom(
+    width,   // u32
+    height,  // u32
+    mean,    // [f32; 3]
+    std,     // [f32; 3]
+);
+```
+
+#### Methods
+
+```rust
+// Load image and preprocess
+pub fn load_and_process<P: AsRef<Path>>(&self, path: P) -> Result<Array4<f32>>
+
+// Preprocess already loaded image
+pub fn process(&self, image: &DynamicImage) -> Result<Array4<f32>>
+```
+
+#### Fields
+
+```rust
+pub struct ImagePreprocessor {
+    pub width: u32,
+    pub height: u32,
+    pub mean: [f32; 3],
+    pub std: [f32; 3],
+    pub resize_mode: ResizeMode,
+}
+```
+
+### ResizeMode
+
+How to resize images to target dimensions.
+
+```rust
+pub enum ResizeMode {
+    Stretch,    // Stretch to fit (may distort)
+    Crop,       // Center crop to fit
+    Letterbox,  // Pad to fit (preserve aspect ratio)
+}
+```
+
+### TextPreprocessor (NLP feature)
+
+Text preprocessing with tokenization.
+
+```rust
+#[cfg(feature = "nlp")]
+use airml_preprocess::{TextPreprocessor, TokenizedInput, TextPreprocessError};
+```
+
+#### Constructors
+
+```rust
+// Load from tokenizer.json file
+let preprocessor = TextPreprocessor::from_file("tokenizer.json")?;
+
+// Load from bytes
+let preprocessor = TextPreprocessor::from_bytes(tokenizer_bytes)?;
+```
+
+#### Builder Methods
+
+```rust
+let preprocessor = TextPreprocessor::from_file("tokenizer.json")?
+    .with_max_length(512)    // Maximum sequence length
+    .with_padding(true)      // Pad to max_length
+    .with_truncation(true);  // Truncate if too long
+```
+
+#### Methods
+
+```rust
+// Encode single text
+pub fn encode(&self, text: &str) -> Result<TokenizedInput, TextPreprocessError>
+
+// Encode batch of texts
+pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<TokenizedInput>, TextPreprocessError>
+```
+
+### TokenizedInput
+
+Result of tokenization.
+
+```rust
+pub struct TokenizedInput {
+    pub input_ids: Vec<i64>,
+    pub attention_mask: Vec<i64>,
+}
+
+impl TokenizedInput {
+    // Convert to ndarray for model input
+    pub fn to_array(&self) -> (Array2<i64>, Array2<i64>)
+}
+```
+
+### TextPreprocessError
+
+Errors from text preprocessing.
+
+```rust
+pub enum TextPreprocessError {
+    LoadError(String),
+    EncodeError(String),
+    TextTooLong { actual: usize, max: usize },
+}
+```
+
+---
+
+## airml-providers
+
+### CpuProvider
+
+CPU execution provider (always available).
+
+```rust
+use airml_providers::CpuProvider;
+
+let provider = CpuProvider::default().into_dispatch();
+```
+
+### CoreMLProvider (coreml feature)
+
+CoreML execution provider for macOS.
+
+```rust
+#[cfg(feature = "coreml")]
+use airml_providers::{CoreMLProvider, ComputeUnits, CoreMLConfig};
+```
+
+#### Constructors
+
+```rust
+// Default (use all compute units)
+let provider = CoreMLProvider::new();
+let provider = CoreMLProvider::default();
+
+// With custom config
+let config = CoreMLConfig { ... };
+let provider = CoreMLProvider::with_config(config);
+```
+
+#### Builder Methods
+
+```rust
+let provider = CoreMLProvider::default()
+    .with_compute_units(ComputeUnits::CpuAndNeuralEngine)
+    .with_subgraphs(true)         // Enable for control flow models
+    .with_static_shapes(false)    // Require static input shapes
+    .with_model_format(format)    // NeuralNetwork or MLProgram
+    .with_cache_dir("/path/to/cache");
+```
+
+#### Convenience Methods
+
+```rust
+// Optimize for Neural Engine
+let provider = CoreMLProvider::default().neural_engine_only();
+
+// Use GPU only (no ANE)
+let provider = CoreMLProvider::default().gpu_only();
+
+// Use CPU only
+let provider = CoreMLProvider::default().cpu_only();
+```
+
+#### Conversion
+
+```rust
+// Convert to ExecutionProviderDispatch for use with SessionConfig
+let dispatch = provider.into_dispatch();
+```
+
+### ComputeUnits
+
+Hardware targets for CoreML.
+
+```rust
+pub enum ComputeUnits {
+    All,                 // CPU + GPU + Neural Engine
+    CpuAndNeuralEngine,  // CPU + Neural Engine (no GPU)
+    CpuAndGpu,           // CPU + GPU (no ANE)
+    CpuOnly,             // CPU only
+}
+```
+
+### CoreMLConfig
+
+Full configuration for CoreML provider.
+
+```rust
+pub struct CoreMLConfig {
+    pub compute_units: ComputeUnits,
+    pub enable_subgraphs: bool,
+    pub require_static_shapes: bool,
+    pub model_format: Option<CoreMLModelFormat>,
+    pub cache_dir: Option<PathBuf>,
+}
+```
+
+### CoreMLModelFormat
+
+Model format for CoreML.
+
+```rust
+pub enum CoreMLModelFormat {
+    NeuralNetwork,  // Better compatibility with older macOS/iOS
+    MLProgram,      // More operators, potentially better performance
+}
+```
+
+### Utility Functions
+
+```rust
+// Check if running on Apple Silicon
+pub fn is_apple_silicon() -> bool
+
+// Auto-select best available providers
+pub fn auto_select_providers() -> Vec<ExecutionProviderDispatch>
+
+// List available provider names
+pub fn available_providers() -> Vec<String>
+
+// Get system information
+pub fn system_info() -> SystemInfo
+```
+
+### SystemInfo
+
+System capability information.
+
+```rust
+pub struct SystemInfo {
+    pub os: String,
+    pub arch: String,
+    pub is_apple_silicon: bool,
+    pub available_providers: Vec<String>,
+}
+```
+
+---
+
+## airml-embed
+
+### EmbeddedModel
+
+Wrapper for models embedded in binaries.
+
+```rust
+use airml_embed::EmbeddedModel;
+
+// Embed at compile time
+static MODEL: &[u8] = include_bytes!("model.onnx");
+```
+
+#### Constructors
+
+```rust
+// From bytes
+let model = EmbeddedModel::new(MODEL);
+
+// With custom config
+let model = EmbeddedModel::with_config(MODEL, config);
+```
+
+#### Methods
+
+```rust
+// Get model bytes
+pub fn bytes(&self) -> &[u8]
+
+// Get model size
+pub fn size(&self) -> usize
+
+// Set configuration
+pub fn config(self, config: SessionConfig) -> Self
+
+// Convert to InferenceEngine
+pub fn into_engine(self) -> Result<InferenceEngine>
+```
+
+### embed_model! Macro
+
+Macro for embedding models.
+
+```rust
+use airml_embed::embed_model;
+
+// Creates static EmbeddedModel
+embed_model!(RESNET, "../models/resnet50.onnx");
+
+fn main() {
+    let engine = RESNET.clone().into_engine().unwrap();
+}
+```
+
+---
+
+## CLI Commands
+
+### airml run
+
+```
+airml run [OPTIONS] --model <MODEL> --input <INPUT>
+
+Options:
+  -m, --model       ONNX model path
+  -i, --input       Input image path
+  -l, --labels      Labels file (one per line)
+  -k, --top-k       Top K predictions [default: 5]
+  -p, --provider    Execution provider [default: auto]
+  --preprocess      Preprocessing preset [default: imagenet]
+  --raw             Output raw tensors
+  -v, --verbose     Verbose output
+```
+
+### airml info
+
+```
+airml info [OPTIONS] --model <MODEL>
+
+Options:
+  -m, --model       ONNX model path
+  -v, --verbose     Detailed information
+```
+
+### airml bench
+
+```
+airml bench [OPTIONS] --model <MODEL>
+
+Options:
+  -m, --model       ONNX model path
+  -n, --iterations  Benchmark iterations [default: 100]
+  -w, --warmup      Warmup iterations [default: 10]
+  -p, --provider    Execution provider [default: auto]
+  --shape           Input shape (e.g., "1,3,224,224")
+```
+
+### airml embed
+
+```
+airml embed [OPTIONS] --model <MODEL> --tokenizer <TOKENIZER> --text <TEXT>
+
+Options:
+  -m, --model       ONNX embedding model path
+  -t, --tokenizer   tokenizer.json path
+  --text            Text to embed
+  --max-length      Max sequence length [default: 512]
+  -p, --provider    Execution provider [default: auto]
+  --output          Output format (json, raw) [default: json]
+  --normalize       L2 normalize embeddings
+  -v, --verbose     Verbose output
+```
+
+### airml system
+
+```
+airml system
+
+Displays:
+  - Operating system
+  - CPU architecture
+  - Apple Silicon detection
+  - Available execution providers
+```
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 0000000..eefc1f9
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,252 @@
+# airML Architecture
+
+This document explains the internal architecture of airML.
+
+## Overview
+
+airML is a lightweight ML inference runtime built in Rust. It provides a CLI for running ONNX models without Python dependencies.
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                              airML CLI                              │
+│               (run, info, bench, system, embed commands)            │
+└─────────────────────────────────────────────────────────────────────┘
+                                   │
+           ┌───────────────────────┼───────────────────────┐
+           ▼                       ▼                       ▼
+┌─────────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐
+│     airml-core      │ │  airml-preprocess   │ │   airml-providers   │
+│                     │ │                     │ │                     │
+│ • InferenceEngine   │ │ • ImagePreprocessor │ │ • CpuProvider       │
+│ • SessionConfig     │ │ • TextPreprocessor  │ │ • CoreMLProvider    │
+│ • ModelMetadata     │ │ • TokenizedInput    │ │ • ComputeUnits      │
+└─────────────────────┘ └─────────────────────┘ └─────────────────────┘
+           │                       │                       │
+           └───────────────────────┼───────────────────────┘
+                                   ▼
+                         ┌─────────────────┐
+                         │       ort       │
+                         │  (ONNX Runtime) │
+                         └─────────────────┘
+```
+
+## Crates
+
+### airml-core
+
+The core inference engine that wraps ONNX Runtime.
+
+**Key Components:**
+
+- `InferenceEngine` - Main interface for loading and running models
+- `SessionConfig` - Configuration for ORT sessions (threads, optimization level, providers)
+- `ModelMetadata` - Model information (inputs, outputs, name)
+- `TensorInfo` - Tensor shape and dtype information
+
+**Flow:**
+
+```
+Model File (.onnx)
+         │
+         ▼
+┌─────────────────┐
+│ InferenceEngine │
+│  ::from_file()  │
+└─────────────────┘
+         │
+         ▼
+┌─────────────────┐
+│   ORT Session   │
+│   (internal)    │
+└─────────────────┘
+         │
+         ▼
+engine.run(input) → outputs
+```
+
+### airml-preprocess
+
+Input preprocessing for images and text.
+
+**Image Preprocessing:**
+
+```rust
+let preprocessor = ImagePreprocessor::imagenet();
+let tensor = preprocessor.load_and_process("image.jpg")?;
+// tensor: [1, 3, 224, 224] f32
+```
+
+**Presets:**
+
+| Preset | Size | Mean | Std |
+|--------|------|------|-----|
+| ImageNet | 224x224 | [0.485, 0.456, 0.406] | [0.229, 0.224, 0.225] |
+| CLIP | 224x224 | [0.481, 0.458, 0.408] | [0.269, 0.261, 0.276] |
+| YOLO | 640x640 | [0, 0, 0] | [1, 1, 1] |
+
+**Text Preprocessing (NLP feature):**
+
+```rust
+let preprocessor = TextPreprocessor::from_file("tokenizer.json")?
+    .with_max_length(512);
+let tokenized = preprocessor.encode("Hello world")?;
+// tokenized.input_ids:      [101, 7592, 2088, 102, 0, 0, ...]
+// tokenized.attention_mask: [1, 1, 1, 1, 0, 0, ...]
+```
+
+### airml-providers
+
+Execution provider abstraction for hardware acceleration.
+
+**Available Providers:**
+
+| Provider | Feature Flag | Hardware |
+|----------|--------------|----------|
+| CPU | (default) | Any |
+| CoreML | `coreml` | Apple Silicon |
+
+**CoreML ComputeUnits:**
+
+```rust
+// Use all available hardware (CPU + GPU + Neural Engine)
+CoreMLProvider::default()
+
+// Optimize for Neural Engine
+CoreMLProvider::default().neural_engine_only()
+
+// Use GPU only (no ANE)
+CoreMLProvider::default().gpu_only()
+
+// CPU only (for debugging)
+CoreMLProvider::default().cpu_only()
+```
+
+### airml-embed
+
+Utilities for embedding models in Rust binaries.
+
+```rust
+use airml_embed::EmbeddedModel;
+
+static MODEL: &[u8] = include_bytes!("model.onnx");
+
+fn main() {
+    let engine = EmbeddedModel::new(MODEL).into_engine()?;
+}
+```
+
+## Data Flow
+
+### Image Classification
+
+```
+┌──────────┐   ┌─────────────────┐   ┌─────────────────┐   ┌──────────┐
+│  Image   │──▶│ImagePreprocessor│──▶│ InferenceEngine │──▶│ Softmax  │
+│  (JPEG)  │   │                 │   │                 │   │  + Top-K │
+└──────────┘   └─────────────────┘   └─────────────────┘   └──────────┘
+                        │                     │
+                        ▼                     ▼
+                  [1,3,224,224]        [1,1000] logits
+```
+
+### Text Embedding
+
+```
+┌──────────┐   ┌─────────────────┐   ┌─────────────────┐   ┌──────────┐
+│   Text   │──▶│ TextPreprocessor│──▶│ InferenceEngine │──▶│ Pooling  │
+│ (String) │   │                 │   │                 │   │ + L2 Norm│
+└──────────┘   └─────────────────┘   └─────────────────┘   └──────────┘
+                        │                     │
+                        ▼                     ▼
+              [1,512] input_ids       [1,seq,768] or
+              [1,512] attention_mask  [1,768] embedding
+```
+
+## CLI Commands
+
+### run
+
+Executes model inference on input.
+
+```
+run command
+    │
+    ├── select_providers() → providers
+    ├── InferenceEngine::from_file_with_config()
+    ├── create_preprocessor() → ImagePreprocessor
+    ├── preprocessor.load_and_process()
+    ├── engine.run()
+    └── print_classification_output() or print_raw_output()
+```
+
+### bench
+
+Benchmarks inference performance.
+
+```
+bench command
+    │
+    ├── InferenceEngine::from_file_with_config()
+    ├── create_random_input()
+    ├── warmup: N iterations (results discarded)
+    ├── benchmark: N iterations (times recorded)
+    └── calculate_stats() → mean, median, p50/90/95/99, throughput
+```
+
+### embed
+
+Generates text embeddings.
+
+```
+embed command
+    │
+    ├── TextPreprocessor::from_file()
+    ├── InferenceEngine::from_file_with_config()
+    ├── preprocessor.encode()
+    ├── engine.run() or engine.run_multiple()
+    ├── extract_embeddings() → mean pooling if 3D
+    ├── l2_normalize() (optional)
+    └── print_json() or print_raw()
+```
+
+## Feature Flags
+
+| Flag | Description | Dependencies |
+|------|-------------|--------------|
+| `cpu` | CPU execution (default) | - |
+| `coreml` | CoreML/Metal acceleration | macOS only |
+| `nlp` | Text preprocessing | tokenizers crate |
+
+## Error Handling
+
+All errors are wrapped in `AirMLError`:
+
+```rust
+pub enum AirMLError {
+    ModelNotFound(String),
+    ModelLoadError(String),
+    InferenceError(String),
+    PreprocessError(String),
+    ConfigError(String),
+    OrtError(String),
+}
+```
+
+## Thread Configuration
+
+```rust
+let config = SessionConfig::new()
+    .with_intra_threads(4)   // Threads within an operator
+    .with_inter_threads(2);  // Threads between operators
+```
+
+## Optimization Levels
+
+```rust
+pub enum OptimizationLevel {
+    None,      // No optimization
+    Basic,     // Basic graph optimizations
+    Extended,  // Extended optimizations
+    All,       // All optimizations (default)
+}
+```
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
new file mode 100644
index 0000000..e1eb196
--- /dev/null
+++ b/docs/TUTORIAL.md
@@ -0,0 +1,380 @@
+# airML Tutorial
+
+This tutorial walks you through using airML for common ML tasks.
+
+## Prerequisites
+
+- Rust toolchain (1.70+)
+- macOS (for CoreML support) or Linux/Windows (CPU only)
+
+## Installation
+
+```bash
+# Clone and build
+git clone https://github.com/airml/airml.git
+cd airml
+
+# Build with all features
+cargo build --release --features coreml,nlp
+
+# Add to PATH (optional)
+export PATH="$PWD/target/release:$PATH"
+```
+
+## Tutorial 1: Image Classification
+
+### Step 1: Get a Model
+
+Download a pre-trained ResNet50 model:
+
+```bash
+# From ONNX Model Zoo
+curl -L -o resnet50.onnx \
+  "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v2-7.onnx"
+```
+
+### Step 2: Get Labels
+
+```bash
+curl -L -o imagenet_labels.txt \
+  "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
+```
+
+### Step 3: Run Inference
+
+```bash
+# Basic usage
+airml run -m resnet50.onnx -i cat.jpg -l imagenet_labels.txt
+
+# Output:
+# Top 5 predictions:
+# --------------------------------------------------
+#   281  95.23% ======================================== tabby
+#   282   3.12% ===                                      tiger cat
+#   285   0.89% =                                        Egyptian cat
+#   287   0.34%                                          lynx
+#   283   0.21%                                          Persian cat
+```
+
+### Step 4: Try Different Providers
+
+```bash
+# CPU only
+airml run -m resnet50.onnx -i cat.jpg -l imagenet_labels.txt -p cpu
+
+# CoreML (macOS)
+airml run -m resnet50.onnx -i cat.jpg -l imagenet_labels.txt -p coreml
+
+# Neural Engine optimized (Apple Silicon)
+airml run -m resnet50.onnx -i cat.jpg -l imagenet_labels.txt -p neural-engine
+```
+
+### Step 5: Benchmark Performance
+
+```bash
+# Compare CPU vs CoreML
+airml bench -m resnet50.onnx -p cpu -n 100
+airml bench -m resnet50.onnx -p coreml -n 100
+airml bench -m resnet50.onnx -p neural-engine -n 100
+```
+
+## Tutorial 2: Text Embeddings
+
+### Step 1: Get an Embedding Model
+
+Download a sentence transformer model (e.g., all-MiniLM-L6-v2):
+
+```bash
+# Using Hugging Face optimum
+pip install optimum[exporters]
+optimum-cli export onnx --model sentence-transformers/all-MiniLM-L6-v2 ./minilm/
+```
+
+This creates:
+- `minilm/model.onnx` - The model
+- `minilm/tokenizer.json` - The tokenizer
+
+### Step 2: Generate Embeddings
+
+```bash
+airml embed \
+  -m minilm/model.onnx \
+  -t minilm/tokenizer.json \
+  --text "Hello, world!"
+
+# Output:
+# {
+#   "text": "Hello, world!",
+#   "dimension": 384,
+#   "embedding": [
+#     0.123456, 0.234567, ...
+#   ]
+# }
+```
+
+### Step 3: Normalize for Similarity Search
+
+```bash
+# L2 normalized embeddings (recommended for cosine similarity)
+airml embed \
+  -m minilm/model.onnx \
+  -t minilm/tokenizer.json \
+  --text "Hello, world!" \
+  --normalize
+```
+
+### Step 4: Different Output Formats
+
+```bash
+# JSON format (default)
+airml embed -m model.onnx -t tokenizer.json --text "Hello" --output json
+
+# Raw format (one number per line)
+airml embed -m model.onnx -t tokenizer.json --text "Hello" --output raw > embedding.txt
+```
+
+## Tutorial 3: Using airML as a Library
+
+### Step 1: Add Dependencies
+
+```toml
+# Cargo.toml
+[dependencies]
+airml-core = { path = "crates/airml-core" }
+airml-preprocess = { path = "crates/airml-preprocess" }
+airml-providers = { path = "crates/airml-providers", features = ["coreml"] }
+```
+
+### Step 2: Image Classification in Code
+
+```rust
+use airml_core::{InferenceEngine, SessionConfig};
+use airml_preprocess::ImagePreprocessor;
+use airml_providers::CoreMLProvider;
+
+fn main() -> anyhow::Result<()> {
+    // Configure with CoreML
+    let providers = vec![CoreMLProvider::default().into_dispatch()];
+    let config = SessionConfig::new().with_providers(providers);
+
+    // Load model
+    let mut engine = InferenceEngine::from_file_with_config("resnet50.onnx", config)?;
+
+    // Preprocess image
+    let preprocessor = ImagePreprocessor::imagenet();
+    let input = preprocessor.load_and_process("cat.jpg")?;
+
+    // Run inference
+    let outputs = engine.run(input.into_dyn())?;
+
+    // Get predictions
+    let logits = &outputs[0];
+    let softmax = softmax(logits);
+    let top_k = top_k_indices(&softmax, 5);
+
+    for (idx, prob) in top_k {
+        println!("{}: {:.2}%", idx, prob * 100.0);
+    }
+
+    Ok(())
+}
+
+fn softmax(logits: &ndarray::ArrayD<f32>) -> Vec<f32> {
+    let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+    let exps: Vec<f32> = logits.iter().map(|x| (x - max).exp()).collect();
+    let sum: f32 = exps.iter().sum();
+    exps.iter().map(|x| x / sum).collect()
+}
+
+fn top_k_indices(probs: &[f32], k: usize) -> Vec<(usize, f32)> {
+    let mut indexed: Vec<_> = probs.iter().copied().enumerate().collect();
+    indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+    indexed.into_iter().take(k).collect()
+}
+```
+
+### Step 3: Text Embeddings in Code
+
+```rust
+use airml_core::{InferenceEngine, SessionConfig};
+use airml_preprocess::TextPreprocessor;
+use airml_providers::auto_select_providers;
+
+fn main() -> anyhow::Result<()> {
+    // Load tokenizer
+    let preprocessor = TextPreprocessor::from_file("tokenizer.json")?
+        .with_max_length(128);
+
+    // Load model
+    let config = SessionConfig::new().with_providers(auto_select_providers());
+    let mut engine = InferenceEngine::from_file_with_config("model.onnx", config)?;
+
+    // Tokenize
+    let tokenized = preprocessor.encode("Hello, world!")?;
+    let (input_ids, attention_mask) = tokenized.to_array();
+
+    // Run inference
+    let outputs = engine.run_multiple(vec![
+        input_ids.into_dyn().mapv(|x| x as f32),
+        attention_mask.into_dyn().mapv(|x| x as f32),
+    ])?;
+
+    // Extract embedding (assuming [batch, hidden] output)
+    let embedding: Vec<f32> = outputs[0].iter().copied().collect();
+
+    // L2 normalize
+    let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
+    let normalized: Vec<f32> = embedding.iter().map(|x| x / norm).collect();
+
+    println!("Embedding dimension: {}", normalized.len());
+    println!("First 5 values: {:?}", &normalized[..5]);
+
+    Ok(())
+}
+```
+
+## Tutorial 4: Embedding Models in Binary
+
+### Step 1: Setup
+
+```rust
+use airml_embed::EmbeddedModel;
+
+// Embed model at compile time
+static MODEL_BYTES: &[u8] = include_bytes!("../models/resnet50.onnx");
+```
+
+### Step 2: Use Embedded Model
+
+```rust
+fn main() -> anyhow::Result<()> {
+    let model = EmbeddedModel::new(MODEL_BYTES);
+    println!("Model size: {} bytes", model.size());
+
+    let engine = model.into_engine()?;
+
+    // Use engine as normal...
+    Ok(())
+}
+```
+
+### Step 3: With Custom Configuration
+
+```rust
+use airml_core::SessionConfig;
+use airml_providers::CoreMLProvider;
+
+fn main() -> anyhow::Result<()> {
+    let config = SessionConfig::new()
+        .with_providers(vec![CoreMLProvider::default().into_dispatch()])
+        .with_intra_threads(4);
+
+    let engine = EmbeddedModel::with_config(MODEL_BYTES, config)
+        .into_engine()?;
+
+    Ok(())
+}
+```
+
+## Tutorial 5: Benchmarking and Optimization
+
+### Step 1: Basic Benchmark
+
+```bash
+airml bench -m model.onnx -n 100 -w 10
+```
+
+Output:
+```
+Results:
+------------------------------------------------------------
+  Total iterations:  100
+  Total time:        1234.567 ms
+
+  Mean latency:      12.346 ms
+  Median latency:    12.100 ms
+  Min latency:       11.200 ms
+  Max latency:       15.800 ms
+  Std deviation:     0.890 ms
+
+  Throughput:        81.00 inferences/sec
+
+  P50:               12.100 ms
+  P90:               13.200 ms
+  P95:               14.100 ms
+  P99:               15.500 ms
+```
+
+### Step 2: Compare Providers
+
+```bash
+# Create a comparison script
+for provider in cpu coreml neural-engine; do
+  echo "=== $provider ==="
+  airml bench -m model.onnx -p $provider -n 100
+done
+```
+
+### Step 3: Optimize Thread Count
+
+```rust
+// Test different thread configurations
+let configs = vec![
+    SessionConfig::new().with_intra_threads(1),
+    SessionConfig::new().with_intra_threads(2),
+    SessionConfig::new().with_intra_threads(4),
+    SessionConfig::new().with_intra_threads(8),
+];
+
+for config in configs {
+    // Benchmark each configuration
+}
+```
+
+## Common Issues
+
+### Model Not Found
+
+```
+Error: Model not found: /path/to/model.onnx
+```
+
+Solution: Check the file path and ensure the model exists.
+
+### Unsupported Operator
+
+```
+Error: Failed to load model: Unsupported operator: CustomOp
+```
+
+Solution: The model uses operators not supported by ONNX Runtime. Try:
+1. Re-export the model with `opset_version=17`
+2. Use a different model architecture
+
+### CoreML Not Available
+
+```
+Warning: CoreML not available, falling back to CPU
+```
+
+Solution:
+1. Ensure you're on macOS
+2. Build with `--features coreml`
+3. Check `airml system` for available providers
+
+### Out of Memory
+
+```
+Error: Failed to allocate memory
+```
+
+Solution:
+1. Reduce batch size
+2. Use a smaller model
+3. Close other applications
+
+## Next Steps
+
+- Read [ARCHITECTURE.md](ARCHITECTURE.md) for internal details
+- Check [API.md](API.md) for full API reference
+- See examples in `/examples` directory

From 6833f8103697dacb4b67088989882acde186a9fa Mon Sep 17 00:00:00 2001
From: rlaope
Date: Sat, 31 Jan 2026 17:09:07 +0900
Subject: [PATCH 3/3] fix :: ci rust workflow

---
 .github/workflows/ci.yml | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 808dcca..3c0505f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,13 +18,13 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@stable
+        uses: dtolnay/rust-toolchain@stable
 
       - name: Cache cargo
         uses: Swatinem/rust-cache@v2
 
       - name: Check
-        run: cargo check --all-features
+        run: cargo check --features nlp
 
   test:
     name: Test
 
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@${{ matrix.rust }}
+        uses: dtolnay/rust-toolchain@${{ matrix.rust }}
 
       - name: Cache cargo
         uses: Swatinem/rust-cache@v2
 
-      - name: Run tests
-        run: cargo test --all-features
+      - name: Run tests (Linux)
+        if: runner.os == 'Linux'
+        run: cargo test --features nlp
+
+      - name: Run tests (macOS)
+        if: runner.os == 'macOS'
+        run: cargo test --features coreml,nlp
 
   fmt:
     name: Format
 
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@stable
+        uses: dtolnay/rust-toolchain@stable
         with:
           components: rustfmt
 
   clippy:
     name: Clippy
 
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@stable
+        uses: dtolnay/rust-toolchain@stable
         with:
           components: clippy
 
       - name: Cache cargo
         uses: Swatinem/rust-cache@v2
 
       - name: Clippy
-        run: cargo clippy --all-features -- -D warnings
+        run: cargo clippy --features nlp -- -D warnings
 
   build-macos:
     name: Build macOS
 
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@stable
+        uses: dtolnay/rust-toolchain@stable
         with:
           targets: aarch64-apple-darwin
 
       - name: Cache cargo
         uses: Swatinem/rust-cache@v2
 
       - name: Build (x86_64)
-        run: cargo build --release
+        run: cargo build --release --features coreml,nlp
 
       - name: Build (aarch64)
-        run: cargo build --release --target aarch64-apple-darwin
+        run: cargo build --release --target aarch64-apple-darwin --features coreml,nlp
         if: runner.arch == 'ARM64'
 
       - name: Check binary size
 
   build-linux:
     name: Build Linux
 
       - uses: actions/checkout@v4
 
       - name: Install Rust
-        uses: dtolnay/rust-action@stable
+        uses: dtolnay/rust-toolchain@stable
 
       - name: Cache cargo
         uses: Swatinem/rust-cache@v2
 
       - name: Build
-        run: cargo build --release
+        run: cargo build --release --features nlp
 
       - name: Check binary size
         run: |