13 changes: 13 additions & 0 deletions multimodal/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Multimodal ExecuTorch Examples

This directory contains examples demonstrating multimodal AI inference using ExecuTorch with various backends (XNNPACK, Metal).

## Projects

| Directory | Description | Model |
|-----------|-------------|-------|
| [ask-anything-app](./ask-anything-app) | Web app with camera + chat interface | Gemma3 Vision + Whisper |
| [text-runtime](./text-runtime) | Text generation | Qwen3-0.6B |
| [text-image-runtime](./text-image-runtime) | Vision-language inference | Gemma3 4B |
| [voice-runtime](./voice-runtime) | Speech-to-text | Whisper Tiny |
| [object-detection-runtime](./object-detection-runtime) | Object detection | YOLO26m |
40 changes: 40 additions & 0 deletions multimodal/ask-anything-app/.gitignore
@@ -0,0 +1,40 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

node_modules
dist
dist-ssr
*.local

# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# Python
__pycache__/
*.py[cod]
*$py.class
.venv/
venv/
env/

# Model files
*.pte
*.bin
*.onnx
*.pt
*.pth
*.safetensors
95 changes: 95 additions & 0 deletions multimodal/ask-anything-app/README.md
@@ -0,0 +1,95 @@
# Ask Anything - Multimodal Web Dashboard

A two-column web dashboard with real-time camera streaming and Facebook-style chat interface, powered by **Gemma3** (vision-language) and **Whisper** (speech-to-text) ExecuTorch runtimes.

## Features

- Real-time camera streaming with frame capture
- Facebook-style chat interface (blue user bubbles, gray AI bubbles)
- Vision-language understanding via Gemma3 4B
- Speech-to-text transcription via Whisper (optional)
- Models loaded at startup for fast inference

## Quick Start

### 1. Start the Backend

```bash
# From the ask-anything-app directory
cd backend

# Install Python dependencies (if not already installed)
pip install -r ../requirements.txt

# Start the FastAPI server
python -m uvicorn main:app --reload --port 8000
```

The backend will load the Gemma3 and Whisper models at startup.

### 2. Start the Frontend

```bash
# From the ask-anything-app directory
npm install # Install dependencies (first time only)
npm run dev # Start the dev server
```

### 3. Open the App

Navigate to http://localhost:5173 in your browser.

- Allow camera access when prompted
- Type a question and press Enter
- The current camera frame will be sent to Gemma3 for analysis

## Project Structure

```
ask-anything-app/
├── backend/                # FastAPI backend
│   ├── main.py             # App entry point
│   ├── config.py           # Model paths
│   ├── modules/            # Inference modules
│   │   ├── base.py         # BaseModule interface
│   │   ├── multimodal/     # Gemma3 module
│   │   └── voice/          # Whisper module
│   └── routers/            # API endpoints
│       ├── health.py       # Health check
│       ├── vision.py       # Vision inference
│       └── speech.py       # Speech transcription
├── src/                    # React frontend
│   ├── components/         # UI components
│   │   ├── layout/         # SplitLayout
│   │   ├── camera/         # CameraStream
│   │   └── chat/           # ChatInterface
│   ├── contexts/           # Zustand store
│   ├── hooks/              # Custom hooks
│   ├── services/           # API client
│   └── types/              # TypeScript types
├── package.json
└── requirements.txt
```

## API Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/health` | GET | Health check |
| `/api/status` | GET | Model status |
| `/api/vision/infer` | POST | Vision-language inference |
| `/api/speech/transcribe` | POST | Speech-to-text |
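
As a sketch, a client would send the captured frame to the vision endpoint as JSON. The `image` and `prompt` field names below are illustrative assumptions; the actual request schema lives in `backend/routers/vision.py`, which is not shown in this diff.

```python
import base64
import json

# Hypothetical request builder for POST /api/vision/infer; the "image"
# and "prompt" field names are assumptions, not confirmed by this diff.
def build_vision_request(frame_bytes: bytes, prompt: str) -> str:
    payload = {
        "image": base64.b64encode(frame_bytes).decode("ascii"),
        "prompt": prompt,
    }
    return json.dumps(payload)

body = build_vision_request(b"<jpeg bytes>", "What objects are visible?")
```

The frontend does the equivalent in TypeScript: grab a frame from the `<video>` element onto a canvas, base64-encode it, and POST it to the backend.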

## Configuration

Model paths are configured in `backend/config.py`:

- **Gemma3**: `../text-image-runtime/gemma3/GEMMA3_4B_XNNPACK_INT8_INT4.pte`
- **Whisper**: `../voice-runtime/models/whisper-tiny-ExecuTorch-XNNPACK/`

## Tech Stack

- **Frontend**: React 19 + TypeScript + Vite + Tailwind CSS
- **State**: Zustand
- **Backend**: FastAPI + Uvicorn
- **ML Runtime**: ExecuTorch
1 change: 1 addition & 0 deletions multimodal/ask-anything-app/backend/__init__.py
@@ -0,0 +1 @@
# Ask Anything Backend
26 changes: 26 additions & 0 deletions multimodal/ask-anything-app/backend/config.py
@@ -0,0 +1,26 @@
"""Configuration for Ask Anything backend."""
from pathlib import Path

# Base paths
APP_DIR = Path(__file__).parent.parent
MULTIMODAL_DIR = APP_DIR.parent

# Model paths
GEMMA3_MODEL_PATH = str(
    MULTIMODAL_DIR / "text-image-runtime" / "gemma3" / "GEMMA3_4B_XNNPACK_INT8_INT4.pte"
)
GEMMA3_PROCESSOR_PATH = str(MULTIMODAL_DIR / "text-image-runtime" / "gemma3")
GEMMA3_HF_MODEL_ID = "google/gemma-3-4b-it"

WHISPER_MODEL_DIR = str(
    MULTIMODAL_DIR / "voice-runtime" / "models" / "whisper-tiny-ExecuTorch-XNNPACK"
)

# Server config
HOST = "0.0.0.0"
PORT = 8000
CORS_ORIGINS = [
    "http://localhost:5173",
    "http://127.0.0.1:5173",
    "http://localhost:3000",
]
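
For reference, a quick sketch of how these relative paths resolve, using a stand-in location for `config.py` (the real value comes from `__file__`):

```python
from pathlib import Path

# Stand-in for Path(__file__) inside backend/config.py.
config_file = Path("multimodal/ask-anything-app/backend/config.py")

app_dir = config_file.parent.parent   # -> multimodal/ask-anything-app
multimodal_dir = app_dir.parent       # -> multimodal

gemma3_model = (
    multimodal_dir / "text-image-runtime" / "gemma3" / "GEMMA3_4B_XNNPACK_INT8_INT4.pte"
)
print(gemma3_model.as_posix())
```

This is why the sibling runtime directories (`text-image-runtime`, `voice-runtime`) must sit next to `ask-anything-app` for the default paths to work.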
122 changes: 122 additions & 0 deletions multimodal/ask-anything-app/backend/main.py
@@ -0,0 +1,122 @@
"""
Ask Anything Backend - FastAPI server for multimodal inference.

This server loads Gemma3 (vision-language) and Whisper (speech-to-text) models
at startup and provides REST API endpoints for inference.

Usage:
From ask-anything-app directory:
python -m uvicorn backend.main:app --reload --port 8000

Or from backend directory:
python -m uvicorn main:app --reload --port 8000
"""
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

# Use try/except to handle both relative and absolute imports
try:
    from .config import HOST, PORT, CORS_ORIGINS
    from .modules.multimodal import Gemma3Module
    from .modules.voice import WhisperModule
    from .routers import health, vision, speech
except ImportError:
    from config import HOST, PORT, CORS_ORIGINS
    from modules.multimodal import Gemma3Module
    from modules.voice import WhisperModule
    from routers import health, vision, speech

# Global module instances (loaded at startup)
gemma3_module = Gemma3Module()
whisper_module = WhisperModule()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load models at startup, cleanup on shutdown."""
    print("=" * 60)
    print("Ask Anything Backend - Starting up...")
    print("=" * 60)

    # Load Gemma3 (slower - 3.5GB model)
    print("\n[1/2] Loading Gemma3 vision-language model...")
    try:
        gemma3_module.load()
        print("  ✓ Gemma3 loaded successfully")
    except Exception as e:
        import traceback
        print(f"  ✗ Failed to load Gemma3: {e}")
        traceback.print_exc()

    # Load Whisper (faster - 231MB model)
    print("\n[2/2] Loading Whisper speech-to-text model...")
    try:
        whisper_module.load()
        print("  ✓ Whisper loaded successfully")
    except Exception as e:
        import traceback
        print(f"  ✗ Failed to load Whisper: {e}")
        traceback.print_exc()

    print("\n" + "=" * 60)
    print("Server ready!")
    print(f"  Gemma3:  {'✓ Loaded' if gemma3_module.is_loaded else '✗ Not loaded'}")
    print(f"  Whisper: {'✓ Loaded' if whisper_module.is_loaded else '✗ Not loaded'}")
    print("=" * 60)

    yield

    # Cleanup on shutdown
    print("\nShutting down...")
    gemma3_module.unload()
    whisper_module.unload()
    print("Goodbye!")


# Create FastAPI app
app = FastAPI(
    title="Ask Anything API",
    description="Multimodal inference API for vision-language and speech-to-text using ExecuTorch",
    version="1.0.0",
    lifespan=lifespan,
)

# Configure CORS for React dev server
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Store module references in app state for access in routes
app.state.gemma3 = gemma3_module
app.state.whisper = whisper_module

# Include routers
app.include_router(health.router, prefix="/api", tags=["Health"])
app.include_router(vision.router, prefix="/api/vision", tags=["Vision"])
app.include_router(speech.router, prefix="/api/speech", tags=["Speech"])


@app.get("/")
async def root():
    """Root endpoint with API information."""
    return {
        "name": "Ask Anything API",
        "version": "1.0.0",
        "endpoints": {
            "health": "/api/health",
            "status": "/api/status",
            "vision": "/api/vision/infer",
            "speech": "/api/speech/transcribe",
        },
    }


if __name__ == "__main__":
    uvicorn.run(app, host=HOST, port=PORT)
7 changes: 7 additions & 0 deletions multimodal/ask-anything-app/backend/modules/__init__.py
@@ -0,0 +1,7 @@
"""Inference modules for Ask Anything."""
try:
    from .base import BaseModule
except ImportError:
    from base import BaseModule

__all__ = ["BaseModule"]
53 changes: 53 additions & 0 deletions multimodal/ask-anything-app/backend/modules/base.py
@@ -0,0 +1,53 @@
"""Abstract base class for inference modules."""
from abc import ABC, abstractmethod
from typing import Any, Dict


class BaseModule(ABC):
    """Abstract base class for all inference modules.

    All modules must implement load(), unload(), and infer() methods
    to provide a consistent interface.
    """

    def __init__(self):
        self._loaded = False
        self._model = None

    @abstractmethod
    def load(self, **kwargs) -> None:
        """Load the model into memory.

        Args:
            **kwargs: Model-specific configuration (paths, etc.)
        """

    @abstractmethod
    def unload(self) -> None:
        """Unload the model and free resources."""

    @abstractmethod
    def infer(self, **kwargs) -> Any:
        """Run inference on input data.

        Args:
            **kwargs: Model-specific input parameters

        Returns:
            Model-specific output
        """

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self._loaded

    def get_status(self) -> Dict[str, Any]:
        """Get module status information."""
        return {
            "loaded": self._loaded,
            "model_type": self.__class__.__name__,
        }
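
A concrete module then only needs to fill in the three abstract methods. A minimal sketch follows; `EchoModule` is a hypothetical stand-in, and `BaseModule` is repeated in trimmed form only to keep the snippet self-contained:

```python
from abc import ABC, abstractmethod
from typing import Any


class BaseModule(ABC):
    """Trimmed copy of the interface above, for a self-contained snippet."""

    def __init__(self):
        self._loaded = False
        self._model = None

    @abstractmethod
    def load(self, **kwargs) -> None: ...

    @abstractmethod
    def unload(self) -> None: ...

    @abstractmethod
    def infer(self, **kwargs) -> Any: ...

    @property
    def is_loaded(self) -> bool:
        return self._loaded


class EchoModule(BaseModule):
    """Hypothetical toy module exercising the load/infer/unload contract."""

    def load(self, **kwargs) -> None:
        # A real module would load a .pte model here.
        self._model = lambda text: f"echo: {text}"
        self._loaded = True

    def unload(self) -> None:
        self._model = None
        self._loaded = False

    def infer(self, **kwargs) -> Any:
        if not self._loaded:
            raise RuntimeError("Model not loaded; call load() first")
        return self._model(kwargs.get("text", ""))
```

The server can then treat `Gemma3Module`, `WhisperModule`, or a test double like this one uniformly through the same interface.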
7 changes: 7 additions & 0 deletions multimodal/ask-anything-app/backend/modules/multimodal/__init__.py
@@ -0,0 +1,7 @@
"""Multimodal (vision-language) modules."""
try:
    from .gemma3_module import Gemma3Module
except ImportError:
    from gemma3_module import Gemma3Module

__all__ = ["Gemma3Module"]