diff --git a/Makefile b/Makefile index 28abe8e6d..09cf4c69e 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,9 @@ help: @echo " verify-imports - Verify all package imports work" @echo " show-versions - Show versions of all packages" @echo " convert-execute-notebooks - Convert notebooks from .py to .ipynb using jupytext" - @echo " generate-colab-notebooks - Generate Colab-compatible notebooks" + @echo " generate-colab-notebooks - Generate Colab-compatible notebooks" + @echo " generate-fern-notebooks - Convert notebooks to Fern format for docs" + @echo " generate-fern-notebooks-with-outputs - Execute notebooks first, then convert (requires API key)" @echo " serve-docs-locally - Serve documentation locally" @echo " check-license-headers - Check if all files have license headers" @echo " update-license-headers - Add license headers to all files" @@ -470,9 +472,29 @@ convert-execute-notebooks: generate-colab-notebooks: @echo "πŸ““ Generating Colab-compatible notebooks..." - uv run --group docs python docs/scripts/generate_colab_notebooks.py + @if [ -d docs/notebooks ] && [ -n "$$(ls docs/notebooks/*.ipynb 2>/dev/null)" ]; then \ + echo " Using executed notebooks from docs/notebooks (outputs preserved)"; \ + uv run --group docs python docs/scripts/generate_colab_notebooks.py --executed-dir docs/notebooks; \ + else \ + echo " Using source only (run 'make convert-execute-notebooks' first for outputs)"; \ + uv run --group docs python docs/scripts/generate_colab_notebooks.py; \ + fi @echo "βœ… Colab notebooks created in docs/colab_notebooks/" +generate-fern-notebooks: generate-colab-notebooks + @echo "πŸ““ Converting notebooks to Fern format for NotebookViewer..." 
+ @mkdir -p fern/components/notebooks + @for f in docs/colab_notebooks/*.ipynb; do \ + if [ -f "$$f" ]; then \ + name=$$(basename "$$f" .ipynb); \ + uv run python fern/scripts/ipynb-to-fern-json.py "$$f" -o fern/components/notebooks/$$name.json; \ + fi; \ + done + @echo "βœ… Fern notebooks created in fern/components/notebooks/" + +generate-fern-notebooks-with-outputs: convert-execute-notebooks + $(MAKE) generate-fern-notebooks + # ============================================================================== # PERFORMANCE # ============================================================================== diff --git a/docs/RFC-FERN-MIGRATION.md b/docs/RFC-FERN-MIGRATION.md new file mode 100644 index 000000000..6964b2015 --- /dev/null +++ b/docs/RFC-FERN-MIGRATION.md @@ -0,0 +1,1371 @@ +# RFC: Migration from MkDocs to Fern Docs + +**Status:** Draft +**Author:** [Author Name] +**Owner:** [Owner Name] +**Created:** 2026-01-14 +**Last Updated:** 2026-01-14 +**Target Completion:** [YYYY-MM-DD] + +--- + +## Summary + +This RFC proposes migrating the NeMo Data Designer documentation from MkDocs Material to [Fern Docs](https://buildwithfern.com/learn/docs/getting-started/overview). The migration will be performed incrementally by creating a new `docs-fern/` directory, preserving all existing content while adapting to Fern's component system. + +## Motivation + +This migration is **mandated** as part of NVIDIA's documentation platform standardization initiative. 
+ +**Additional benefits:** + +- **Modern documentation platform**: Fern offers AI-native features including Ask Fern and auto-generated MCP servers +- **Enhanced API documentation**: Better support for API reference documentation from OpenAPI specs +- **Improved developer experience**: Rich component library with interactive elements +- **Self-hosting options**: Flexible deployment for enterprise requirements + +## Scope + +### In Scope + +- 1:1 content migration (no content changes) +- Component mapping from MkDocs Material to Fern equivalents +- Navigation structure preservation +- Code reference documentation migration + +### Out of Scope + +- Content rewrites or restructuring +- New features or sections +- Removal of existing documentation + +--- + +## Current Documentation Inventory + +### File Structure + +``` +docs/ +β”œβ”€β”€ index.md # Home page +β”œβ”€β”€ installation.md # Installation guide +β”œβ”€β”€ quick-start.md # Quick start tutorial +β”œβ”€β”€ CONTRIBUTING.md # Contribution guide +β”œβ”€β”€ concepts/ +β”‚ β”œβ”€β”€ columns.md +β”‚ β”œβ”€β”€ validators.md +β”‚ β”œβ”€β”€ processors.md +β”‚ β”œβ”€β”€ person_sampling.md +β”‚ └── models/ +β”‚ β”œβ”€β”€ default-model-settings.md +β”‚ β”œβ”€β”€ custom-model-settings.md +β”‚ β”œβ”€β”€ configure-model-settings-with-the-cli.md +β”‚ β”œβ”€β”€ model-providers.md +β”‚ β”œβ”€β”€ model-configs.md +β”‚ └── inference-parameters.md +β”œβ”€β”€ recipes/ +β”‚ β”œβ”€β”€ cards.md +β”‚ β”œβ”€β”€ code_generation/ +β”‚ β”‚ β”œβ”€β”€ text_to_python.md +β”‚ β”‚ └── text_to_sql.md +β”‚ └── qa_and_chat/ +β”‚ β”œβ”€β”€ product_info_qa.md +β”‚ └── multi_turn_chat.md +β”œβ”€β”€ plugins/ +β”‚ β”œβ”€β”€ overview.md +β”‚ β”œβ”€β”€ example.md +β”‚ └── available.md +β”œβ”€β”€ code_reference/ # Auto-generated API docs +β”‚ β”œβ”€β”€ models.md +β”‚ β”œβ”€β”€ column_configs.md +β”‚ β”œβ”€β”€ config_builder.md +β”‚ β”œβ”€β”€ data_designer_config.md +β”‚ β”œβ”€β”€ sampler_params.md +β”‚ β”œβ”€β”€ validator_params.md +β”‚ β”œβ”€β”€ 
processors.md +β”‚ └── analysis.md +β”œβ”€β”€ colab_notebooks/ # Jupyter notebooks +β”‚ β”œβ”€β”€ 1-the-basics.ipynb +β”‚ β”œβ”€β”€ 2-structured-outputs-and-jinja-expressions.ipynb +β”‚ β”œβ”€β”€ 3-seeding-with-a-dataset.ipynb +β”‚ └── 4-providing-images-as-context.ipynb +β”œβ”€β”€ assets/ +β”‚ └── recipes/ # Downloadable code files +β”œβ”€β”€ css/ # Custom styles +β”œβ”€β”€ js/ # Custom scripts +└── overrides/ # MkDocs template overrides +``` + +### Current Navigation Structure + +```yaml +nav: + - Getting Started: + - Welcome: index.md + - Installation: installation.md + - Quick Start: quick-start.md + - Contributing: CONTRIBUTING.md + - Concepts: + - Models: (6 sub-pages) + - Columns: concepts/columns.md + - Validators: concepts/validators.md + - Processors: concepts/processors.md + - Person Sampling: concepts/person_sampling.md + - Tutorials: + - Overview + 4 Jupyter notebooks + - Recipes: + - Recipe Cards + 4 recipes + - Plugins: + - 3 pages + - Code Reference: + - 8 auto-generated API docs +``` + +--- + +## Component Mapping + +### MkDocs β†’ Fern Component Equivalents + +Reference: [Fern Components Overview](https://buildwithfern.com/learn/docs/writing-content/components/overview) + +| MkDocs Feature | Current Syntax | Fern Equivalent | Notes | +|----------------|----------------|-----------------|-------| +| **Admonitions** | `!!! note "Title"` | ``, ``, ``, `` | See [Callouts](#1-admonitions--callouts) | +| **Tabbed Content** | `=== "Tab 1"` | `` + `` | See [Tabs](#2-tabbed-content) | +| **Code Blocks** | ` ```python ` | ` ```python ` | Direct compatibility | +| **Code Snippets** | `--8<-- "path"` | `` with `src` | File embedding | +| **Grid Cards** | `
` | `` + `` | See [Cards](#3-grid-cards) | +| **Icons** | `:material-xxx:` | Fern icons or inline SVG | Limited support | +| **Download Links** | `{ .md-button download=}` | Standard markdown links | Simplified | +| **API Docs** | `::: module.path` | Manual or OpenAPI import | See [API Reference](#4-api-reference) | +| **Jupyter Notebooks** | `.ipynb` files | Convert to MDX or embed | See [Notebooks](#5-jupyter-notebooks) | +| **Versioning** | Mike plugin | Fern versioning | Built-in support | + +--- + +## Detailed Component Migrations + +### 1. Admonitions β†’ Callouts + +**Current MkDocs syntax:** + +```markdown +!!! note "The Declarative Approach" + Columns are **declarative specifications**. You describe *what* you want... + +!!! tip "Conditional Sampling" + Samplers support **conditional parameters**... + +!!! question "New to Data Designer?" + Recipes provide working code... + +!!! warning "Important" + This action cannot be undone. +``` + +**Fern equivalent:** + +```mdx + +Columns are **declarative specifications**. You describe *what* you want... + + + +Samplers support **conditional parameters**... + + + +Recipes provide working code... + + + +This action cannot be undone. + +``` + +**Migration mapping:** + +| MkDocs Admonition | Fern Callout | +|-------------------|--------------| +| `!!! note` | `` | +| `!!! tip` | `` | +| `!!! info` | `` | +| `!!! warning` | `` | +| `!!! question` | `` | +| `!!! danger` | `` | + +### 2. Tabbed Content + +**Current MkDocs syntax (installation.md):** + +```markdown +=== "pip" + + ```bash + pip install data-designer + ``` + +=== "uv" + + ```bash + uv add data-designer + ``` +``` + +**Fern equivalent:** + +```mdx + + + ```bash + pip install data-designer + ``` + + + ```bash + uv add data-designer + ``` + + +``` + +### 3. Grid Cards + +**Current MkDocs syntax (recipes/cards.md):** + +```markdown +
+ +- :material-snake:{ .lg .middle } **Text to Python** + + Generate a dataset of natural language instructions... + + --- + + **Demonstrates:** + - Python code generation + - Python code validation + + --- + + [:material-book-open-page-variant: View Recipe](code_generation/text_to_python.md){ .md-button } + +
+``` + +**Fern equivalent:** + +```mdx + + + Generate a dataset of natural language instructions... + + **Demonstrates:** + - Python code generation + - Python code validation + + +``` + +### 4. API Reference (mkdocstrings) + +**Current MkDocs syntax (code_reference/models.md):** + +```markdown +# Models + +The `models` module defines configuration objects... + +::: data_designer.config.models +``` + +**Fern options:** + +**Option A: Manual Documentation** +Convert auto-generated docs to manually written MDX with code examples. + +**Option B: OpenAPI Integration** +If the API has an OpenAPI spec, use Fern's native API reference generation. + +**Option C: TypeDoc/PyDoc Integration** +Use Fern's SDK documentation features if available. + +**Recommendation:** Start with Option A (manual) for initial migration, evaluate automation options post-migration. + +### 5. Jupyter Notebooks + +**Current approach:** `mkdocs-jupyter` plugin renders `.ipynb` files directly. + +**Fern options:** + +**Option A: Convert to MDX** +Convert notebooks to MDX files with code blocks and output screenshots. + +**Option B: Embed as iframes** +Host notebooks on Colab/GitHub and embed links. + +**Option C: Use Fern's code playground** +If available, use interactive code features. + +**Recommendation:** Convert to MDX with static code blocks and link to Colab for interactive experience (preserves current Colab badge functionality). + +### 6. Code Snippets (pymdownx.snippets) + +**Current MkDocs syntax:** + +```markdown +```python +--8<-- "assets/recipes/code_generation/text_to_python.py" +``` +``` + +**Fern equivalent:** + +```mdx + +``` + +Or inline the code directly if file embedding isn't supported. 
+ +--- + +## Proposed Directory Structure + +``` +docs-fern/ +β”œβ”€β”€ fern.config.json # Fern configuration +β”œβ”€β”€ docs.yml # Navigation and settings +β”œβ”€β”€ pages/ +β”‚ β”œβ”€β”€ index.mdx # Home page +β”‚ β”œβ”€β”€ installation.mdx +β”‚ β”œβ”€β”€ quick-start.mdx +β”‚ β”œβ”€β”€ contributing.mdx +β”‚ β”œβ”€β”€ concepts/ +β”‚ β”‚ β”œβ”€β”€ columns.mdx +β”‚ β”‚ β”œβ”€β”€ validators.mdx +β”‚ β”‚ β”œβ”€β”€ processors.mdx +β”‚ β”‚ β”œβ”€β”€ person-sampling.mdx +β”‚ β”‚ └── models/ +β”‚ β”‚ β”œβ”€β”€ default-model-settings.mdx +β”‚ β”‚ β”œβ”€β”€ custom-model-settings.mdx +β”‚ β”‚ β”œβ”€β”€ configure-with-cli.mdx +β”‚ β”‚ β”œβ”€β”€ model-providers.mdx +β”‚ β”‚ β”œβ”€β”€ model-configs.mdx +β”‚ β”‚ └── inference-parameters.mdx +β”‚ β”œβ”€β”€ tutorials/ +β”‚ β”‚ β”œβ”€β”€ overview.mdx +β”‚ β”‚ β”œβ”€β”€ the-basics.mdx +β”‚ β”‚ β”œβ”€β”€ structured-outputs.mdx +β”‚ β”‚ β”œβ”€β”€ seeding-with-dataset.mdx +β”‚ β”‚ └── images-as-context.mdx +β”‚ β”œβ”€β”€ recipes/ +β”‚ β”‚ β”œβ”€β”€ index.mdx # Recipe cards +β”‚ β”‚ β”œβ”€β”€ code-generation/ +β”‚ β”‚ β”‚ β”œβ”€β”€ text-to-python.mdx +β”‚ β”‚ β”‚ └── text-to-sql.mdx +β”‚ β”‚ └── qa-and-chat/ +β”‚ β”‚ β”œβ”€β”€ product-info-qa.mdx +β”‚ β”‚ └── multi-turn-chat.mdx +β”‚ β”œβ”€β”€ plugins/ +β”‚ β”‚ β”œβ”€β”€ overview.mdx +β”‚ β”‚ β”œβ”€β”€ example.mdx +β”‚ β”‚ └── available.mdx +β”‚ └── api-reference/ +β”‚ β”œβ”€β”€ models.mdx +β”‚ β”œβ”€β”€ column-configs.mdx +β”‚ β”œβ”€β”€ config-builder.mdx +β”‚ β”œβ”€β”€ data-designer-config.mdx +β”‚ β”œβ”€β”€ sampler-params.mdx +β”‚ β”œβ”€β”€ validator-params.mdx +β”‚ β”œβ”€β”€ processors.mdx +β”‚ └── analysis.mdx +β”œβ”€β”€ assets/ +β”‚ β”œβ”€β”€ favicon.png +β”‚ └── recipes/ # Downloadable code files +β”‚ β”œβ”€β”€ code_generation/ +β”‚ └── qa_and_chat/ +└── styles/ + └── custom.css # Custom styling (if needed) +``` + +--- + +## URL Redirect Mapping + +To preserve existing bookmarks and SEO, all old URLs must redirect to their new locations. 
+ +### Redirect Rules + +| Old MkDocs URL | New Fern URL | +|----------------|--------------| +| `/` | `/docs` | +| `/installation/` | `/docs/installation` | +| `/quick-start/` | `/docs/quick-start` | +| `/CONTRIBUTING/` | `/docs/contributing` | +| `/concepts/columns/` | `/docs/concepts/columns` | +| `/concepts/validators/` | `/docs/concepts/validators` | +| `/concepts/processors/` | `/docs/concepts/processors` | +| `/concepts/person_sampling/` | `/docs/concepts/person-sampling` | +| `/concepts/models/default-model-settings/` | `/docs/concepts/models/default-model-settings` | +| `/concepts/models/custom-model-settings/` | `/docs/concepts/models/custom-model-settings` | +| `/concepts/models/configure-model-settings-with-the-cli/` | `/docs/concepts/models/configure-with-cli` | +| `/concepts/models/model-providers/` | `/docs/concepts/models/model-providers` | +| `/concepts/models/model-configs/` | `/docs/concepts/models/model-configs` | +| `/concepts/models/inference-parameters/` | `/docs/concepts/models/inference-parameters` | +| `/tutorials/` | `/docs/tutorials/overview` | +| `/recipes/cards/` | `/docs/recipes` | +| `/recipes/code_generation/text_to_python/` | `/docs/recipes/code-generation/text-to-python` | +| `/recipes/code_generation/text_to_sql/` | `/docs/recipes/code-generation/text-to-sql` | +| `/recipes/qa_and_chat/product_info_qa/` | `/docs/recipes/qa-and-chat/product-info-qa` | +| `/recipes/qa_and_chat/multi_turn_chat/` | `/docs/recipes/qa-and-chat/multi-turn-chat` | +| `/plugins/overview/` | `/docs/plugins/overview` | +| `/plugins/example/` | `/docs/plugins/example` | +| `/plugins/available/` | `/docs/plugins/available` | +| `/code_reference/*` | `/api/*` | + +### Implementation + +**Option A: Fern redirects configuration** (if supported) + +```yaml +# In docs.yml +redirects: + - from: /concepts/person_sampling + to: /docs/concepts/person-sampling + # ... 
additional redirects +``` + +**Option B: Hosting platform redirects** + +For Netlify (`_redirects` file): +``` +/concepts/person_sampling/* /docs/concepts/person-sampling/:splat 301 +/code_reference/* /api/:splat 301 +``` + +For nginx: +```nginx +rewrite ^/concepts/person_sampling(.*)$ /docs/concepts/person-sampling$1 permanent; +rewrite ^/code_reference/(.*)$ /api/$1 permanent; +``` + +--- + +## Configuration Files + +### fern.config.json + +```json +{ + "organization": "nvidia-nemo", + "version": "1.0.0" +} +``` + +### docs.yml + +```yaml +instances: + - url: https://datadesigner.docs.nvidia.com + +title: NeMo Data Designer + +tabs: + docs: + display-name: Documentation + slug: docs + api: + display-name: API Reference + slug: api + +navigation: + - tab: docs + layout: + - section: Getting Started + contents: + - page: Welcome + path: pages/index.mdx + - page: Installation + path: pages/installation.mdx + - page: Quick Start + path: pages/quick-start.mdx + - page: Contributing + path: pages/contributing.mdx + - section: Concepts + contents: + - section: Models + contents: + - page: Default Model Settings + path: pages/concepts/models/default-model-settings.mdx + - page: Custom Model Settings + path: pages/concepts/models/custom-model-settings.mdx + - page: Configure with CLI + path: pages/concepts/models/configure-with-cli.mdx + - page: Model Providers + path: pages/concepts/models/model-providers.mdx + - page: Model Configs + path: pages/concepts/models/model-configs.mdx + - page: Inference Parameters + path: pages/concepts/models/inference-parameters.mdx + - page: Columns + path: pages/concepts/columns.mdx + - page: Validators + path: pages/concepts/validators.mdx + - page: Processors + path: pages/concepts/processors.mdx + - page: Person Sampling + path: pages/concepts/person-sampling.mdx + - section: Tutorials + contents: + - page: Overview + path: pages/tutorials/overview.mdx + - page: The Basics + path: pages/tutorials/the-basics.mdx + - page: Structured 
Outputs + path: pages/tutorials/structured-outputs.mdx + - page: Seeding with a Dataset + path: pages/tutorials/seeding-with-dataset.mdx + - page: Images as Context + path: pages/tutorials/images-as-context.mdx + - section: Recipes + contents: + - page: Recipe Cards + path: pages/recipes/index.mdx + - section: Code Generation + contents: + - page: Text to Python + path: pages/recipes/code-generation/text-to-python.mdx + - page: Text to SQL + path: pages/recipes/code-generation/text-to-sql.mdx + - section: QA and Chat + contents: + - page: Product Info QA + path: pages/recipes/qa-and-chat/product-info-qa.mdx + - page: Multi-Turn Chat + path: pages/recipes/qa-and-chat/multi-turn-chat.mdx + - section: Plugins + contents: + - page: Overview + path: pages/plugins/overview.mdx + - page: Example Plugin + path: pages/plugins/example.mdx + - page: Available Plugins + path: pages/plugins/available.mdx + - tab: api + layout: + - section: API Reference + contents: + - page: Models + path: pages/api-reference/models.mdx + - page: Column Configs + path: pages/api-reference/column-configs.mdx + - page: Config Builder + path: pages/api-reference/config-builder.mdx + - page: Data Designer Config + path: pages/api-reference/data-designer-config.mdx + - page: Sampler Params + path: pages/api-reference/sampler-params.mdx + - page: Validator Params + path: pages/api-reference/validator-params.mdx + - page: Processors + path: pages/api-reference/processors.mdx + - page: Analysis + path: pages/api-reference/analysis.mdx + +colors: + accent-primary: + dark: "#76B900" + light: "#76B900" + background: + dark: "#1a1a1a" + light: "#ffffff" + +logo: + dark: assets/favicon.png + light: assets/favicon.png + +favicon: assets/favicon.png + +navbar-links: + - type: github + value: https://github.com/NVIDIA-NeMo/DataDesigner +``` + +--- + +## Migration Plan + +### Phase 1: Setup (1 day) + +1. Create `docs-fern/` directory structure +2. Initialize Fern configuration files +3. 
Set up local development environment +4. Verify Fern CLI works (`fern check`, `fern generate`) + +### Phase 2: Core Pages Migration (2-3 days) + +1. Migrate Getting Started section + - `index.md` β†’ `index.mdx` + - `installation.md` β†’ `installation.mdx` + - `quick-start.md` β†’ `quick-start.mdx` + - `CONTRIBUTING.md` β†’ `contributing.mdx` + +2. Migrate Concepts section (6 model pages + 4 concept pages) + +3. Migrate Plugins section (3 pages) + +### Phase 3: Complex Content Migration (3-4 days) + +1. Convert Jupyter notebooks to MDX + - Extract code cells as code blocks + - Convert markdown cells directly + - Add Colab badges/links + +2. Migrate Recipes section + - Convert grid cards to Fern Cards + - Migrate recipe content pages + - Handle code snippet embedding + +### Phase 4: API Reference Migration (2-3 days) + +1. Extract API documentation from mkdocstrings output +2. Manually format as MDX pages +3. Add code examples and cross-references + +### Phase 5: Styling and Polish (1-2 days) + +1. Apply NVIDIA branding (green accent color) +2. Configure navigation and tabs +3. Add favicon and logos +4. Test responsive design + +### Phase 6: Testing and Validation (1-2 days) + +1. Review all pages for rendering issues +2. Verify all links work +3. Test navigation flow +4. 
Compare against original docs for completeness + +--- + +## CI/CD Pipeline Changes + +### Current MkDocs Pipeline + +```yaml +# Current workflow (to be replaced) +- name: Build docs + run: mkdocs build + +- name: Deploy docs + run: mkdocs gh-deploy +``` + +### New Fern Pipeline + +```yaml +# .github/workflows/docs.yml +name: Documentation + +on: + push: + branches: [main] + paths: + - 'docs-fern/**' + pull_request: + paths: + - 'docs-fern/**' + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install Fern CLI + run: npm install -g fern-api + + - name: Validate Fern config + run: fern check + working-directory: docs-fern + + - name: Generate docs (PR preview) + if: github.event_name == 'pull_request' + run: fern generate --docs --preview + working-directory: docs-fern + env: + FERN_TOKEN: ${{ secrets.FERN_TOKEN }} + + - name: Deploy docs (production) + if: github.ref == 'refs/heads/main' + run: fern generate --docs + working-directory: docs-fern + env: + FERN_TOKEN: ${{ secrets.FERN_TOKEN }} +``` + +### Required Secrets + +| Secret | Description | +|--------|-------------| +| `FERN_TOKEN` | API token from Fern dashboard for deployments | + +### Local Development + +```bash +# Install Fern CLI +npm install -g fern-api + +# Navigate to docs directory +cd docs-fern + +# Validate configuration +fern check + +# Local preview (starts dev server) +fern docs dev + +# Generate static output +fern generate --docs +``` + +--- + +## Deprecation Timeline + +### Week 1-2: Parallel Operation + +- `docs-fern/` is the primary documentation source +- `docs/` remains for reference and rollback capability +- Both directories exist in repository +- MkDocs config (`mkdocs.yml`) remains but is not used in CI + +### Week 3: Soft Deprecation + +- Remove MkDocs from CI/CD pipeline +- Add deprecation notice to `docs/README.md`: + ```markdown + > ⚠️ **DEPRECATED**: 
This directory is no longer maintained. + > Documentation has moved to `docs-fern/`. + > This directory will be removed on [DATE]. + ``` +- Update `CONTRIBUTING.md` to reference new docs location + +### Week 4: Hard Deprecation + +- Delete `docs/` directory +- Delete `mkdocs.yml` +- Remove MkDocs dependencies from `pyproject.toml`: + - `mkdocs` + - `mkdocs-material` + - `mkdocs-jupyter` + - `mkdocstrings` + - `mkdocstrings-python` +- Update `.gitignore` to remove MkDocs artifacts (`site/`) +- Archive final MkDocs state in git tag: `mkdocs-final` + +### Post-Migration Cleanup + +- Remove custom CSS (`docs/css/`) +- Remove custom JS (`docs/js/`) +- Remove template overrides (`docs/overrides/`) +- Update README.md documentation links + +--- + +## Risks and Mitigations + +| Risk | Impact | Likelihood | Mitigation | Owner | +|------|--------|------------|------------|-------| +| API reference quality loss | High | Medium | Document Python APIs manually with curated examples; add to PR checklist | [Owner] | +| Notebook interactivity loss | Medium | Low | Link to Colab badges at top of each tutorial; keep `.ipynb` files hosted | [Owner] | +| Icon support gaps | Low | High | Replace `:material-xxx:` with emoji or text labels; document in style guide | [Owner] | +| Custom CSS incompatibility | Low | Medium | Use Fern's built-in components; minimal custom CSS only if essential | [Owner] | +| Build/deploy workflow breaks | Medium | Medium | Test CI/CD in separate branch before merging; keep MkDocs as fallback for 2 weeks | [Owner] | +| SEO ranking drop | Medium | Medium | Implement all redirects before deprecating old URLs; submit sitemap to search engines | [Owner] | +| Broken links post-migration | Medium | High | Run automated link checker before go-live; fix all broken links | [Owner] | + +--- + +## Common Pitfalls & Troubleshooting + +### Pitfall 1: Nested Admonitions + +MkDocs supports nested admonitions; Fern callouts do not nest well. + +**Problem:** +```markdown +!!! 
note + Some text + !!! warning + Nested warning +``` + +**Solution:** Flatten to sequential callouts: +```mdx + +Some text + + + +Nested warning (now separate) + +``` + +### Pitfall 2: Code Blocks Inside Tabs + +Indentation is critical. Fern expects proper nesting. + +**Problem (incorrect indentation):** +```mdx + + +```python +code +``` + + +``` + +**Solution (correct indentation):** +```mdx + + + ```python + code + ``` + + +``` + +### Pitfall 3: MkDocs-Specific Syntax + +These MkDocs features have no direct Fern equivalent: + +| MkDocs Syntax | Action | +|---------------|--------| +| `{ .md-button }` | Remove, use standard links | +| `{ .annotate }` | Remove, use inline notes | +| `[TOC]` | Remove, Fern auto-generates TOC | +| `--8<-- "file"` | Inline the code or use `` | +| `::: module.path` | Convert to manual documentation | + +### Pitfall 4: Image Paths + +MkDocs resolves images relative to the markdown file; Fern resolves from project root. + +**MkDocs:** +```markdown +![Alt](../assets/image.png) +``` + +**Fern:** +```mdx +![Alt](/assets/image.png) +``` + +### Pitfall 5: Front Matter + +Fern uses YAML front matter for page metadata. 
Add to each file: + +```mdx +--- +title: Page Title +description: Optional description for SEO +--- +``` + +### Troubleshooting Commands + +```bash +# Validate all Fern configuration +fern check + +# See detailed errors +fern check --log-level debug + +# Preview locally before deploying +fern docs dev + +# Check for broken internal links +grep -r '](/[^)]*\.mdx)' docs-fern/pages/ | grep -v '^#' +``` + +--- + +## Rollback Plan + +If critical issues are discovered post-migration, follow this rollback procedure: + +### Trigger Conditions + +Initiate rollback if any of these occur within 2 weeks of go-live: + +- [ ] >10% of pages have rendering issues +- [ ] Search functionality broken +- [ ] CI/CD pipeline repeatedly failing +- [ ] Critical content missing or incorrect +- [ ] Stakeholder requests rollback + +### Rollback Steps + +**Step 1: Restore MkDocs CI/CD (15 minutes)** + +```yaml +# Revert .github/workflows/docs.yml to MkDocs version +git revert +git push origin main +``` + +**Step 2: Restore DNS/Hosting (if changed)** + +Point documentation URL back to MkDocs deployment location. + +**Step 3: Communicate** + +Notify team: +> Documentation rollback initiated due to [REASON]. +> MkDocs docs restored at [URL]. +> Fern migration paused pending [ISSUE] resolution. 
+ +**Step 4: Preserve Fern Work** + +```bash +# Don't delete - branch and preserve +git checkout -b fern-migration-paused +git push origin fern-migration-paused +``` + +**Step 5: Post-Mortem** + +Document: +- What triggered the rollback +- Root cause analysis +- Required fixes before retry +- Updated timeline + +### Rollback Window + +- **Weeks 1-2**: Full rollback capability (MkDocs still in repo) +- **Week 3+**: Rollback requires restoring from `mkdocs-final` tag +- **Week 4+**: Rollback requires significant effort (MkDocs deleted) + +--- + +## Pre-Flight Checklist + +Before starting migration, ensure: + +- [ ] Fern account created and `FERN_TOKEN` obtained +- [ ] Hosting decision finalized (Section: Decisions #4) +- [ ] Timeline approved and dates filled in (Section: Decisions #5) +- [ ] Owner assigned in RFC header +- [ ] Team notified of upcoming changes +- [ ] Current docs snapshot archived (`git tag mkdocs-snapshot-pre-migration`) + +--- + +## Conversion Checklist + +### File-by-File Migration Tracker + +Use this checklist during Phase 2-4 to track progress: + +#### Getting Started +- [ ] `index.md` β†’ `pages/index.mdx` +- [ ] `installation.md` β†’ `pages/installation.mdx` +- [ ] `quick-start.md` β†’ `pages/quick-start.mdx` +- [ ] `CONTRIBUTING.md` β†’ `pages/contributing.mdx` + +#### Concepts - Models +- [ ] `concepts/models/default-model-settings.md` β†’ `pages/concepts/models/default-model-settings.mdx` +- [ ] `concepts/models/custom-model-settings.md` β†’ `pages/concepts/models/custom-model-settings.mdx` +- [ ] `concepts/models/configure-model-settings-with-the-cli.md` β†’ `pages/concepts/models/configure-with-cli.mdx` +- [ ] `concepts/models/model-providers.md` β†’ `pages/concepts/models/model-providers.mdx` +- [ ] `concepts/models/model-configs.md` β†’ `pages/concepts/models/model-configs.mdx` +- [ ] `concepts/models/inference-parameters.md` β†’ `pages/concepts/models/inference-parameters.mdx` + +#### Concepts - Other +- [ ] `concepts/columns.md` β†’ 
`pages/concepts/columns.mdx` +- [ ] `concepts/validators.md` β†’ `pages/concepts/validators.mdx` +- [ ] `concepts/processors.md` β†’ `pages/concepts/processors.mdx` +- [ ] `concepts/person_sampling.md` β†’ `pages/concepts/person-sampling.mdx` + +#### Tutorials (Notebook Conversion) +- [ ] `colab_notebooks/1-the-basics.ipynb` β†’ `pages/tutorials/the-basics.mdx` +- [ ] `colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb` β†’ `pages/tutorials/structured-outputs.mdx` +- [ ] `colab_notebooks/3-seeding-with-a-dataset.ipynb` β†’ `pages/tutorials/seeding-with-dataset.mdx` +- [ ] `colab_notebooks/4-providing-images-as-context.ipynb` β†’ `pages/tutorials/images-as-context.mdx` +- [ ] Create `pages/tutorials/overview.mdx` (new index page) + +#### Recipes +- [ ] `recipes/cards.md` β†’ `pages/recipes/index.mdx` +- [ ] `recipes/code_generation/text_to_python.md` β†’ `pages/recipes/code-generation/text-to-python.mdx` +- [ ] `recipes/code_generation/text_to_sql.md` β†’ `pages/recipes/code-generation/text-to-sql.mdx` +- [ ] `recipes/qa_and_chat/product_info_qa.md` β†’ `pages/recipes/qa-and-chat/product-info-qa.mdx` +- [ ] `recipes/qa_and_chat/multi_turn_chat.md` β†’ `pages/recipes/qa-and-chat/multi-turn-chat.mdx` + +#### Plugins +- [ ] `plugins/overview.md` β†’ `pages/plugins/overview.mdx` +- [ ] `plugins/example.md` β†’ `pages/plugins/example.mdx` +- [ ] `plugins/available.md` β†’ `pages/plugins/available.mdx` + +#### API Reference +- [ ] `code_reference/models.md` β†’ `pages/api-reference/models.mdx` +- [ ] `code_reference/column_configs.md` β†’ `pages/api-reference/column-configs.mdx` +- [ ] `code_reference/config_builder.md` β†’ `pages/api-reference/config-builder.mdx` +- [ ] `code_reference/data_designer_config.md` β†’ `pages/api-reference/data-designer-config.mdx` +- [ ] `code_reference/sampler_params.md` β†’ `pages/api-reference/sampler-params.mdx` +- [ ] `code_reference/validator_params.md` β†’ `pages/api-reference/validator-params.mdx` +- [ ] 
`code_reference/processors.md` β†’ `pages/api-reference/processors.mdx` +- [ ] `code_reference/analysis.md` β†’ `pages/api-reference/analysis.mdx` + +#### Assets +- [ ] Copy `assets/palette-favicon.png` β†’ `assets/favicon.png` +- [ ] Copy `assets/recipes/` β†’ `assets/recipes/` + +--- + +## Success Criteria + +- [ ] All existing documentation pages migrated (32 pages total) +- [ ] Navigation structure preserved +- [ ] All code examples render correctly +- [ ] All internal links functional (automated check) +- [ ] All external links functional (automated check) +- [ ] NVIDIA branding applied (green accent: #76B900) +- [ ] Local development workflow documented +- [ ] CI/CD pipeline deployed and tested +- [ ] URL redirects configured and tested +- [ ] PR preview deployments working +- [ ] Page load time < 3 seconds + +--- + +## Decisions + +The following decisions have been made to ensure smooth execution: + +### 1. API Reference Approach + +**Decision:** Manual documentation with code examples (Option A) + +**Rationale:** +- Fastest path to migration completion +- Allows curated examples rather than raw docstring dumps +- Fern's Python SDK autodoc is not mature enough for our needs + +**Maintenance commitment:** +- API reference pages will be updated alongside code changes +- Add to PR checklist: "Update API docs if public interfaces changed" +- Revisit automation options in Q2 2026 + +### 2. Notebook Handling + +**Decision:** Convert to MDX with Colab links + +**Implementation:** +- Extract code cells as fenced code blocks +- Convert markdown cells directly to MDX +- Preserve Colab badge at top of each tutorial +- Link to hosted `.ipynb` files for interactive experience + +**Example header for converted notebooks:** +```mdx +--- +title: The Basics +--- + + +Run this tutorial interactively in [Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb). + +``` + +### 3. 
Versioning + +**Decision:** Single version initially, evaluate multi-version post-launch + +**Rationale:** +- Current MkDocs setup is single-version +- No immediate need for versioned docs +- Fern supports versioning when needed + +### 4. Hosting + +**Decision:** [Fern-hosted | Self-hosted] _(fill in)_ + +**If Fern-hosted:** +- URL: `https://datadesigner.docs.buildwithfern.com` or custom domain +- Zero infrastructure management +- Built-in CDN and SSL + +**If self-hosted:** +- Deploy to existing NVIDIA infrastructure +- Use `fern generate --docs` to produce static output +- Configure redirects on hosting platform + +### 5. Timeline + +**Decision:** [X weeks] from RFC approval + +| Milestone | Target Date | +|-----------|-------------| +| Phase 1 (Setup) complete | [DATE] | +| Phase 2-3 (Content migration) complete | [DATE] | +| Phase 4 (API reference) complete | [DATE] | +| Phase 5-6 (Polish & testing) complete | [DATE] | +| Go-live | [DATE] | +| Old docs deprecated | [DATE + 2 weeks] | + +--- + +## Helper Scripts + +The following scripts can assist with automated conversion: + +### 1. Admonition Converter + +```python +#!/usr/bin/env python3 +"""Convert MkDocs admonitions to Fern callouts.""" +import re +import sys + +ADMONITION_MAP = { + "note": "Note", + "tip": "Tip", + "info": "Info", + "warning": "Warning", + "danger": "Warning", + "question": "Info", + "example": "Info", + "abstract": "Note", + "success": "Tip", + "failure": "Warning", + "bug": "Warning", +} + +def convert_admonitions(content: str) -> str: + """Convert !!! admonitions to components.""" + pattern = r'!!! 
(\w+)(?: "([^"]*)")?\n((?: .*\n?)*)' + + def replace(match: re.Match) -> str: + admon_type = match.group(1).lower() + title = match.group(2) or "" + body = match.group(3) + # Remove 4-space indent from body + body = re.sub(r'^ ', '', body, flags=re.MULTILINE).strip() + fern_type = ADMONITION_MAP.get(admon_type, "Note") + if title: + return f'<{fern_type} title="{title}">\n{body}\n\n' + return f'<{fern_type}>\n{body}\n\n' + + return re.sub(pattern, replace, content) + +if __name__ == "__main__": + content = sys.stdin.read() + print(convert_admonitions(content)) +``` + +**Usage:** +```bash +cat docs/concepts/columns.md | python scripts/convert_admonitions.py > docs-fern/pages/concepts/columns.mdx +``` + +### 2. Tabs Converter + +```python +#!/usr/bin/env python3 +"""Convert MkDocs tabs to Fern Tabs components.""" +import re +import sys + +def convert_tabs(content: str) -> str: + """Convert === tabs to components.""" + # Match tab groups + pattern = r'((?:=== "([^"]+)"\n((?: .*\n?)*)\n?)+)' + + def replace_group(match: re.Match) -> str: + group = match.group(0) + tabs = re.findall(r'=== "([^"]+)"\n((?: .*\n?)*)', group) + result = [""] + for title, body in tabs: + body = re.sub(r'^ ', '', body, flags=re.MULTILINE).strip() + result.append(f' ') + result.append(f' {body}') + result.append(' ') + result.append("") + return '\n'.join(result) + '\n' + + return re.sub(pattern, replace_group, content) + +if __name__ == "__main__": + content = sys.stdin.read() + print(convert_tabs(content)) +``` + +### 3. 
Notebook to MDX Converter + +```python +#!/usr/bin/env python3 +"""Convert Jupyter notebook to MDX.""" +import json +import sys +from pathlib import Path + +def notebook_to_mdx(notebook_path: str, colab_url: str) -> str: + """Convert a Jupyter notebook to MDX format.""" + with open(notebook_path) as f: + nb = json.load(f) + + lines = [ + "---", + f"title: {Path(notebook_path).stem.replace('-', ' ').title()}", + "---", + "", + '', + f"Run this tutorial interactively in [Google Colab]({colab_url}).", + "", + "", + ] + + for cell in nb.get("cells", []): + cell_type = cell.get("cell_type") + source = "".join(cell.get("source", [])) + + if cell_type == "markdown": + lines.append(source) + lines.append("") + elif cell_type == "code": + lines.append("```python") + lines.append(source) + lines.append("```") + lines.append("") + + return "\n".join(lines) + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: notebook_to_mdx.py ") + sys.exit(1) + print(notebook_to_mdx(sys.argv[1], sys.argv[2])) +``` + +**Usage:** +```bash +python scripts/notebook_to_mdx.py \ + docs/colab_notebooks/1-the-basics.ipynb \ + "https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner/blob/main/docs/colab_notebooks/1-the-basics.ipynb" \ + > docs-fern/pages/tutorials/the-basics.mdx +``` + +### 4. Link Checker + +```bash +#!/bin/bash +# Check all links in Fern docs +cd docs-fern + +# Internal links +grep -roh '\[.*\]([^)]*\.mdx)' pages/ | sort | uniq + +# External links +grep -roh 'https://[^)]*' pages/ | sort | uniq | while read url; do + if ! curl -s --head "$url" | head -1 | grep -q "200\|301\|302"; then + echo "BROKEN: $url" + fi +done +``` + +### 5. 
Batch Conversion Script + +```bash +#!/bin/bash +# batch_convert.sh - Run all conversions + +set -e + +SCRIPTS_DIR="scripts" +DOCS_DIR="docs" +FERN_DIR="docs-fern/pages" + +# Create directory structure +mkdir -p "$FERN_DIR"/{concepts/models,tutorials,recipes/{code-generation,qa-and-chat},plugins,api-reference} + +# Convert simple pages (admonitions + tabs) +for file in index installation quick-start CONTRIBUTING; do + src="$DOCS_DIR/$file.md" + if [ -f "$src" ]; then + dst="$FERN_DIR/${file,,}.mdx" + cat "$src" | python "$SCRIPTS_DIR/convert_admonitions.py" | python "$SCRIPTS_DIR/convert_tabs.py" > "$dst" + echo "Converted: $src -> $dst" + fi +done + +echo "Batch conversion complete. Manual review required." +``` + +--- + +## References + +- [Fern Docs Getting Started](https://buildwithfern.com/learn/docs/getting-started/overview) +- [Fern Components Overview](https://buildwithfern.com/learn/docs/writing-content/components/overview) +- [Fern Configuration](https://buildwithfern.com/learn/docs/configuration/site-level-settings) +- [Current MkDocs Configuration](../mkdocs.yml) + +--- + +## Appendix: Sample Migration + +### Before (MkDocs - columns.md excerpt) + +```markdown +# Columns + +Columns are the fundamental building blocks in Data Designer. + +!!! note "The Declarative Approach" + Columns are **declarative specifications**. You describe *what* you want... + +## Column Types + +### 🎲 Sampler Columns + +Sampler columns generate data using numerical sampling... + +!!! tip "Conditional Sampling" + Samplers support **conditional parameters**... +``` + +### After (Fern - columns.mdx excerpt) + +```mdx +# Columns + +Columns are the fundamental building blocks in Data Designer. + + +Columns are **declarative specifications**. You describe *what* you want... + + +## Column Types + +### 🎲 Sampler Columns + +Sampler columns generate data using numerical sampling... + + +Samplers support **conditional parameters**... 
+ +``` diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 9a2456e69..66c194387 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "00c21026", + "id": "34304cc0", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "ece3d9a9", + "id": "d407c70a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "38d1b88f", + "id": "10a53966", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53321634", + "id": "03518092", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5e8544d6", + "id": "9eb01405", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a9e48bc", + "id": "0620b86f", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "21b12719", + "id": "c56a11f5", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d689c22", + "id": "4c75c742", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "3db3eab3", + "id": "ed587c27", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4447bbe", + "id": "d276fc65", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "b5af9991", + "id": "ad7ee6a4", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "40bdb697", + "id": "4f11a79c", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "4dad8aa0", + "id": "37fec12e", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8eecf6e8", + "id": "5503a564", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "e4d6a23a", + "id": "fb1b7bd8", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c3ce7276", + "id": "fa0a52fe", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "a8aafd2c", + "id": "33d10aad", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3bdb3991", + "id": "08b43716", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "743bb645", + "id": "647e6aac", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da2b9677", + "id": "c9062ca9", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "febed040", + "id": "e2a1a3ba", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af574e1c", + "id": "4f47448c", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5cddea8", + "id": "5562058b", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "523da02f", + "id": "60c93917", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "b58b6a23", + "id": "93ed2108", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26b9a54a", + "id": "4e385e37", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "ae2f9efe", + "id": "e246e323", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d8341c24", + "id": "40f65835", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "746166bb", + "id": "ee916774", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c67992b", + "id": "009f7e61", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "65da8b83", + "id": "9bb500b1", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 75e2d72db..bd1d498a1 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3d5ec9c5", + "id": "5b57e503", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "3813ccb2", + "id": "d9ae6b33", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "86173a51", + "id": "991aea10", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "6ee5a0e0", + "id": "1674de34", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87742e65", + "id": "fb9fc94f", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "450a862c", + "id": "5c3a6750", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "8f06cd05", + "id": "505190dd", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a880c00", + "id": "f2a2ce1e", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "d862ae5c", + "id": "a27febdd", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84e6f76a", + "id": "73cb0bf0", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "07b038aa", + "id": "983889d2", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7e42df4", + "id": "00e44ceb", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "600127e0", + "id": "63dfc1e6", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ecebc077", + "id": "c2d1a831", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "6f24c511", + "id": "15155d07", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6cd4a4a5", + 
"id": "dc2d88b9", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "3fa250c7", + "id": "4ecbd214", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77895d82", + "id": "4d7a8993", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "236f32c0", + "id": "f38b1b81", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "719d3d7f", + "id": "b89146e9", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d25b2a23", + "id": "645d6c9f", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8cfff7c2", + "id": "e3b65a4d", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "acfc4317", + "id": "586eaf8f", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "02a90c0a", + "id": "b27be3ef", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "60bac583", + "id": "a91c5a4d", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd92ca3c", + "id": "5aa63151", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca5eded6", + "id": "769fd142", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29f4b884", + "id": "837585fd", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": 
"18914be2", + "id": "2a34c7fa", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 91c13986d..c1853c2ae 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "30b0205f", + "id": "1d1e0d32", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "fd7184e7", + "id": "59a2287f", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "f229a5f3", + "id": "37598099", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3cfdeadf", + "id": "e21fdec6", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8ad3bee9", + "id": "6f0f7a06", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7a8d675", + "id": "074125e6", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "e52b2806", + "id": "6a0fa5cd", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21ad21d1", + "id": "025133a6", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "e313e1c7", + "id": "a5acc512", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5927e232", + "id": "c045292e", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": 
"3fe284f0", + "id": "9ecf5d66", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0475564b", + "id": "c83f40e8", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "588837c2", + "id": "736239fb", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e8dfb164", + "id": "37d3c904", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "ca5f46ea", + "id": "775fb307", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "830810e8", + "id": "ac5dd110", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "cbb1e2ad", + "id": "34e4310f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9c39104", + "id": "227e584e", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5750e220", + "id": "cddf1ec9", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3573753", + "id": "9a12e8b5", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "14937896", + "id": "97aa4334", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd3adb37", + "id": "11a1bb87", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "aa4fee79", + "id": "857ca634", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ 
{ "cell_type": "code", "execution_count": null, - "id": "29024ffc", + "id": "42fc4f63", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "73da6149", + "id": "a329c7e9", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc2f927d", + "id": "c9fe3e2f", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "29990c5d", + "id": "9ea737dd", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index cc10ec63d..1de070dda 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "911877e5", + "id": "92c252fe", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "c6756afd", + "id": "6dddba82", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "d73b25ce", + "id": "c676c7f6", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "f05ece3e", + "id": "ea9c162d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d84f4489", + "id": "9003e48c", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5e4cc2d4", + "id": "b84124e6", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4e4e8d45", + "id": "623b014f", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": 
"markdown", - "id": "0cdd2a8a", + "id": "4fff6c6d", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4bb0ca16", + "id": "fab8e68a", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "bd17820d", + "id": "b95f0f43", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "301f2bd2", + "id": "c942693b", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "ad04f82a", + "id": "295a9be1", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac8e2885", + "id": "d3895093", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "7b8aafc0", + "id": "18c1c8c0", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "432edd4a", + "id": "4ff899e2", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4f94627", + "id": "53c29682", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b697311", + "id": "6d221329", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bcfc97e8", + "id": "4e965420", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0a3bdc13", + "id": "b3f30b9e", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9665355", + "id": "3d2d08e3", "metadata": {}, "outputs": [], "source": [ @@ -335,7 
+335,7 @@ }, { "cell_type": "markdown", - "id": "6d900aaa", + "id": "d420e3c5", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51a80346", + "id": "653d0fed", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea217964", + "id": "8d0f2be7", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be0e4ef0", + "id": "f3a438ec", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "0c75f531", + "id": "7842c013", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bcbf86d1", + "id": "a5544e9d", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "0ab35029", + "id": "a2f94c56", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03314ae9", + "id": "c868dec7", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "e76a3e3b", + "id": "599cc8ad", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d16566c0", + "id": "6a155306", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e7796ba", + "id": "44a29388", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "14bc1042", + "id": "c31562b7", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "1e676330", + "id": "3d6e086d", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git 
a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index ea9e0b8fa..a130f0171 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3b8abde3", + "id": "949689cb", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -24,7 +24,7 @@ }, { "cell_type": "markdown", - "id": "1da8d75f", + "id": "4fd2547b", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -35,7 +35,7 @@ }, { "cell_type": "markdown", - "id": "cc461005", + "id": "f7d4c635", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "206037bf", + "id": "0d8093be", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db5a4929", + "id": "0d7b2670", "metadata": {}, "outputs": [], "source": [ @@ -75,7 +75,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3cba8b6", + "id": "3a7649ef", "metadata": {}, "outputs": [], "source": [ @@ -88,7 +88,7 @@ }, { "cell_type": "markdown", - "id": "444aa9dc", + "id": "44101a2b", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1932342c", + "id": "1368d828", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": "markdown", - "id": "aa7b90c5", + "id": "eb48a53b", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -120,7 +120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df7e4385", + "id": "74d3635f", "metadata": {}, "outputs": [], "source": [ @@ -142,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "a1325e38", + "id": "9e6f63b5", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -153,7 +153,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "95064ed0", + "id": "0b3f2138", "metadata": {}, "outputs": [], "source": [ @@ -326,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "c6fe0620", + "id": "0a83b06e", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -337,7 +337,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7323dce5", + "id": "36c1b7a4", "metadata": {}, "outputs": [], "source": [ @@ -347,7 +347,7 @@ { "cell_type": "code", "execution_count": null, - "id": "510b933c", + "id": "9a61fbba", "metadata": {}, "outputs": [], "source": [ @@ -358,7 +358,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0c8c197f", + "id": "7ada9a40", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +367,7 @@ }, { "cell_type": "markdown", - "id": "4cffd205", + "id": "08e9ac10", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -378,7 +378,7 @@ { "cell_type": "code", "execution_count": null, - "id": "308bf2b8", + "id": "3530dd98", "metadata": {}, "outputs": [], "source": [ @@ -388,7 +388,7 @@ { "cell_type": "code", "execution_count": null, - "id": "02610965", + "id": "a69079d9", "metadata": {}, "outputs": [], "source": [ @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "189af389", + "id": "362943d4", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ }, { "cell_type": "markdown", - "id": "51558182", + "id": "89ba40d4", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb index ddfe9d371..51dc6e477 100644 --- a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb +++ b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c7129daf", + "id": "e67d9bd6", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Image-to-Image Editing\n", @@ -25,7 +25,7 @@ }, { 
"cell_type": "markdown", - "id": "6a438ee3", + "id": "c8161027", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -36,7 +36,7 @@ }, { "cell_type": "markdown", - "id": "1a022157", + "id": "f43f67b0", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -47,7 +47,7 @@ { "cell_type": "code", "execution_count": null, - "id": "752fe3eb", + "id": "ea1f7730", "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49266cc2", + "id": "5ea2b3bf", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d87dfa0b", + "id": "2d804455", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "markdown", - "id": "c99ff426", + "id": "4986af56", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -106,7 +106,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9be6231b", + "id": "fc1f8501", "metadata": {}, "outputs": [], "source": [ @@ -115,7 +115,7 @@ }, { "cell_type": "markdown", - "id": "3e242b51", + "id": "7efaf70f", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-editing model\n", @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34dd8eed", + "id": "a37fb71a", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "98abe1a9", + "id": "a06fca7d", "metadata": {}, "source": [ "### 🌱 Load animal portraits from HuggingFace\n", @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "233f483b", + "id": "685c5384", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +199,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b1a7b59", + "id": "d4f17648", "metadata": {}, "outputs": [], "source": [ @@ -216,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "2956a5a6", + "id": "12993667", "metadata": {}, "source": [ "### πŸ—οΈ Build the configuration\n", @@ -233,7 +233,7 
@@ { "cell_type": "code", "execution_count": null, - "id": "f79ffa72", + "id": "1d9acb93", "metadata": {}, "outputs": [], "source": [ @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "0cba69c0", + "id": "1ee3daba", "metadata": {}, "source": [ "### πŸ” Preview: quick iteration\n", @@ -332,7 +332,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ec669ae2", + "id": "4e8d12a9", "metadata": {}, "outputs": [], "source": [ @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41ac4a95", + "id": "c83a0098", "metadata": {}, "outputs": [], "source": [ @@ -353,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f041d9d", + "id": "8f62881c", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ }, { "cell_type": "markdown", - "id": "483fa24a", + "id": "e690f53e", "metadata": { "lines_to_next_cell": 2 }, @@ -375,7 +375,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd4d7dff", + "id": "f1e7371a", "metadata": {}, "outputs": [], "source": [ @@ -411,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af08dc6c", + "id": "00901a02", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "9ee15c83", + "id": "f832d3dd", "metadata": {}, "source": [ "### πŸ†™ Create at scale\n", @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f0d27f8", + "id": "8023a215", "metadata": {}, "outputs": [], "source": [ @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc17414a", + "id": "d6c2e1b0", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ { "cell_type": "code", "execution_count": null, - "id": "849c03b6", + "id": "34787071", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ }, { "cell_type": "markdown", - "id": "b7385f02", + "id": "bb907145", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/scripts/generate_colab_notebooks.py 
b/docs/scripts/generate_colab_notebooks.py index b490ccb9e..c0a8819b1 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -6,6 +6,9 @@ 1. Injects Colab-specific setup cells (pip install, API key from secrets) 2. Injects cells before the "Import the essentials" section 3. Saves the result as .ipynb files in docs/colab_notebooks + +When --executed-dir is provided and an executed .ipynb exists there, uses it as the +base (preserving cell outputs for display in Fern docs). Otherwise reads from source .py. """ from __future__ import annotations @@ -14,6 +17,7 @@ from pathlib import Path import jupytext +import nbformat from nbformat import NotebookNode from nbformat.v4 import new_code_cell, new_markdown_cell @@ -99,30 +103,41 @@ def process_notebook(notebook: NotebookNode, source_path: Path) -> NotebookNode: return notebook -def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path: +def generate_colab_notebook( + source_path: Path, output_dir: Path, executed_dir: Path | None = None +) -> Path: """Generate a Colab-compatible notebook from a source file. 
Args: source_path: Path to the jupytext percent-format Python source file output_dir: Directory to save the output notebook + executed_dir: Optional directory with executed .ipynb files (preserves outputs) Returns: Path to the generated notebook """ - # Read the source file using jupytext - notebook = jupytext.read(source_path) + executed_path = ( + (executed_dir / f"{source_path.stem}.ipynb") if executed_dir else None + ) + + if executed_path and executed_path.exists(): + notebook = nbformat.read(executed_path, as_version=4) + else: + notebook = jupytext.read(source_path) - # Process the notebook for Colab notebook = process_notebook(notebook, source_path) - # Determine output path output_path = output_dir / f"{source_path.stem}.ipynb" - - # Ensure output directory exists output_dir.mkdir(parents=True, exist_ok=True) - # Write the notebook - jupytext.write(notebook, output_path, config={"metadata": {"jupytext": {"cell_metadata_filter": "-id"}}}) + if executed_path and executed_path.exists(): + nbformat.write(notebook, output_path) + else: + jupytext.write( + notebook, + output_path, + config={"metadata": {"jupytext": {"cell_metadata_filter": "-id"}}}, + ) return output_path @@ -142,6 +157,12 @@ def main() -> None: default=Path("docs/colab_notebooks"), help="Directory to save Colab notebooks (default: docs/colab_notebooks)", ) + parser.add_argument( + "--executed-dir", + type=Path, + default=None, + help="Directory with executed .ipynb files (preserves outputs for Fern docs)", + ) parser.add_argument( "--files", nargs="*", @@ -165,6 +186,8 @@ def main() -> None: print(f"πŸ““ Generating Colab notebooks from {len(source_files)} source file(s)...") print(f" Source: {args.source_dir}") print(f" Output: {args.output_dir}") + if args.executed_dir: + print(f" Executed (outputs): {args.executed_dir}") print() for source_path in source_files: @@ -173,7 +196,9 @@ def main() -> None: continue try: - output_path = generate_colab_notebook(source_path, args.output_dir) + 
output_path = generate_colab_notebook( + source_path, args.output_dir, args.executed_dir + ) print(f"βœ… {source_path.name} β†’ {output_path.name}") except Exception as e: print(f"❌ {source_path.name}: {e}") diff --git a/fern/README.md b/fern/README.md new file mode 100644 index 000000000..c7729e2a9 --- /dev/null +++ b/fern/README.md @@ -0,0 +1,185 @@ +# Fern Documentation Cheat Sheet + +This folder contains the Fern Docs configuration for NeMo Data Designer. + +## πŸ“¦ Installation + +```bash +# Install Fern CLI globally +npm install -g fern-api + +# Or use npx (no install needed) +npx fern-api --version +``` + +## πŸ” Local Preview + +**Before first run (for NotebookViewer pages):** +```bash +make generate-colab-notebooks # docs/colab_notebooks/*.ipynb +make generate-fern-notebooks # fern/components/notebooks/*.ts +``` + +```bash +# From the fern/ directory +cd fern/ +fern docs dev + +# Or from project root +fern docs dev --project ./fern +``` + +The docs will be available at `http://localhost:3000`. + +See [DOCS-VS-FERN.md](DOCS-VS-FERN.md) for docs/ vs fern/ comparison. See [components/README.md](components/README.md) for custom components (Authors, MetricsTable, NotebookViewer, etc.). + +## πŸ“ Folder Structure + +``` +fern/ +β”œβ”€β”€ docs.yml # Global config (title, colors, versions) +β”œβ”€β”€ fern.config.json # Fern CLI config (org name) +β”œβ”€β”€ versions/ +β”‚ β”œβ”€β”€ v0.3.3.yml # Navigation for v0.3.3 +β”‚ └── v0.4.0.yml # Navigation for v0.4.0 +β”œβ”€β”€ v0.3.3/ +β”‚ └── pages/ # MDX content for v0.3.3 +β”œβ”€β”€ v0.4.0/ +β”‚ └── pages/ # MDX content for v0.4.0 +└── assets/ # Shared images, favicons +``` + +## πŸ”„ Bumping the Version + +When releasing a new version (e.g., v0.5.0): + +### 1. Copy the previous version's content +```bash +cp -r fern/v0.4.0 fern/v0.5.0 +``` + +### 2. Create the navigation file +```bash +cp fern/versions/v0.4.0.yml fern/versions/v0.5.0.yml +``` + +### 3. 
Update paths in `versions/v0.5.0.yml` +Change all `../v0.4.0/pages/` β†’ `../v0.5.0/pages/` + +### 4. Add the new version to `docs.yml` +```yaml +versions: + - display-name: v0.5.0 + path: versions/v0.5.0.yml + slug: v0.5.0 + - display-name: v0.4.0 + path: versions/v0.4.0.yml + slug: v0.4.0 + # ... older versions +``` + +### 5. Make your content changes +Edit files in `fern/v0.5.0/pages/` + +## ✏️ Editing Content + +### Adding a new page + +1. Create the MDX file in the appropriate version folder: + ```bash + touch fern/v0.3.3/pages/concepts/new-feature.mdx + ``` + +2. Add frontmatter: + ```mdx + --- + title: New Feature + description: Description for SEO. + --- + + Content starts here... + ``` + +3. Add to navigation in `versions/v0.3.3.yml`: + ```yaml + - page: New Feature + path: ../v0.3.3/pages/concepts/new-feature.mdx + ``` + +### MDX Components + +```mdx +# Callouts +Informational note +Helpful tip +Warning message +Info callout + +# Tabs + + + ```python + print("hello") + ``` + + + ```javascript + console.log("hello") + ``` + + + +# Cards + + + Description + + +``` + +## πŸš€ Deploying + +```bash +# Generate static docs (for CI/CD) +fern generate --docs + +# Deploy to Fern hosting +fern docs deploy +``` + +## πŸ”— Useful Links + +- [Fern Docs](https://buildwithfern.com/learn/docs) +- [MDX Components Reference](https://buildwithfern.com/learn/docs/components) +- [Versioning Guide](https://buildwithfern.com/learn/docs/configuration/versions) +- [Navigation Configuration](https://buildwithfern.com/learn/docs/configuration/navigation) + +## πŸ““ NotebookViewer Component + +Renders Jupyter notebooks in Fern docs with a Colab badge. Source: `docs/notebook_source/*.py` (Jupytext percent-format: `# %%` code, `# %% [markdown]` markdown). + +**Pipeline:** Jupytext reads `.py` β†’ `generate_colab_notebooks` injects Colab setup β†’ `generate-fern-notebooks` runs `ipynb-to-fern-json.py` β†’ outputs `fern/components/notebooks/*.json` + `*.ts`. 
Makefile passes `-o fern/components/notebooks/$$name.json`; the script writes `.ts` alongside. + +**Commands:** +```bash +make generate-colab-notebooks # Colab-ready .ipynb +make generate-fern-notebooks # Runs colab first, then converts to .ts +make generate-fern-notebooks-with-outputs # Execute first (needs API key), then convert +``` + +**Add a new tutorial:** Add `N-name.py` to `docs/notebook_source/`, run pipeline, add MDX page that imports from `@/components/notebooks/N-name`. + +**Files:** `NotebookViewer.tsx`, `fern/components/notebooks/*.ts` (generated), `fern/scripts/ipynb-to-fern-json.py`, `notebook-viewer.css`. Requires Fern Pro/Enterprise. + +## ⚠️ Common Issues + +### "EISDIR: illegal operation on a directory" +- Check that all `path:` values point to `.mdx` files, not directories + +### Page not showing +- Verify the page is listed in the version's navigation file +- Check the path is correct (relative to the versions/ folder) + +### Version selector not appearing +- Ensure `versions:` is defined in `docs.yml` +- Each version needs a valid `.yml` file in `versions/` diff --git a/fern/assets/NVIDIA_dark.svg b/fern/assets/NVIDIA_dark.svg new file mode 100644 index 000000000..04850d9d6 --- /dev/null +++ b/fern/assets/NVIDIA_dark.svg @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + diff --git a/fern/assets/NVIDIA_light.svg b/fern/assets/NVIDIA_light.svg new file mode 100644 index 000000000..9ee045c3e --- /dev/null +++ b/fern/assets/NVIDIA_light.svg @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + diff --git a/fern/assets/NVIDIA_symbol.svg b/fern/assets/NVIDIA_symbol.svg new file mode 100644 index 000000000..c0507afe0 --- /dev/null +++ b/fern/assets/NVIDIA_symbol.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + diff --git a/fern/assets/favicon.png b/fern/assets/favicon.png new file mode 100644 index 000000000..11c795d3e Binary files /dev/null and b/fern/assets/favicon.png differ diff --git 
a/fern/assets/images/deployment-enterprise-gateway.png b/fern/assets/images/deployment-enterprise-gateway.png new file mode 100644 index 000000000..8b337406d Binary files /dev/null and b/fern/assets/images/deployment-enterprise-gateway.png differ diff --git a/fern/assets/images/deployment-library-decentralized.png b/fern/assets/images/deployment-library-decentralized.png new file mode 100644 index 000000000..cc3a29623 Binary files /dev/null and b/fern/assets/images/deployment-library-decentralized.png differ diff --git a/fern/assets/images/deployment-microservice.png b/fern/assets/images/deployment-microservice.png new file mode 100644 index 000000000..e870278c3 Binary files /dev/null and b/fern/assets/images/deployment-microservice.png differ diff --git a/fern/assets/images/design-principles-hero.png b/fern/assets/images/design-principles-hero.png new file mode 100644 index 000000000..03e6c8d41 Binary files /dev/null and b/fern/assets/images/design-principles-hero.png differ diff --git a/fern/assets/images/rqa-blog.png b/fern/assets/images/rqa-blog.png new file mode 100644 index 000000000..f43fe156e Binary files /dev/null and b/fern/assets/images/rqa-blog.png differ diff --git a/fern/assets/recipes/code_generation/text_to_python.py b/fern/assets/recipes/code_generation/text_to_python.py new file mode 100644 index 000000000..b5cb88d32 --- /dev/null +++ b/fern/assets/recipes/code_generation/text_to_python.py @@ -0,0 +1,318 @@ +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + 
+ config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Healthcare", + "Finance", + "Technology", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Beginner", + "Intermediate", + "Advanced", + ], + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="code_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="code_complexity", + values={ + "Beginner": [ + "Variables", + "Data Types", + "Functions", + "Loops", + "Classes", + ], + "Intermediate": [ + "List Comprehensions", + "Object-oriented programming", + "Lambda Functions", + "Web frameworks", + "Pandas", + ], + "Advanced": [ + "Multithreading", + "Context Managers", + "Generators", + ], + }, + ), + ), + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write a function that", + "Create a class that", + "Implement a script", + "Can you create a function", + "Develop a module that", + ], + ), + ), + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="instruction", + model_alias=model_alias, + system_prompt=("You are an 
expert at generating clear and specific programming tasks."), + prompt=( + "Generate an instruction to create Python code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n" + "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" + ), + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="code_implementation", + model_alias=model_alias, + code_lang=CodeLang.PYTHON, + system_prompt=( + "You are an expert Python programmer who writes clean, efficient, and well-documented code." 
+ ), + prompt=( + "Write Python code for the following instruction:\n" + "Instruction: {{ instruction }}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n" + "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n" + "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n" + "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{code_concept}}.\n" + ), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="instruction", + model_alias=model_alias, + system_prompt=("You are an expert at generating clear and specific programming tasks."), + prompt=( + "Generate an instruction to create Python code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n" + "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" + ), + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="code_implementation", + model_alias=model_alias, + code_lang=CodeLang.PYTHON, + system_prompt=( + "You are an expert Python programmer who writes clean, efficient, and well-documented code." 
+ ), + prompt=( + "Write Python code for the following instruction:\n" + "Instruction: {{ instruction }}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n" + "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n" + "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n" + "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n" + ), + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE, + scores=python_scoring, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["code_implementation"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.PYTHON, + ), + batch_size=100, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\ +You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving. + +You think about potential flaws and errors in the code. You are a tough critic, but a fair one. + +Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS. 
+ +#### INSTRUCTIONS +The Generated Python Code should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ instruction }} + +Generated Python Code +{{ code_implementation }} +""" + + +python_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="Pythonic", + description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)", + options={ + 4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.", + 3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.", + 2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.", + 1: "The code loosely follows Python conventions, with several deviations from best practices.", + 0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.", + }, + ), + Score( + name="Readability", + description="Readability and Maintainability (Is the Python code easy to understand and maintain?)", + options={ + 4: ( + "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, " + "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format." 
+ ), + 3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.", + 2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.", + 1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.", + 0: "The code is unreadable, with no attempt at formatting or description.", + }, + ), + Score( + name="Efficiency", + description="Efficiency and Performance (Is the code optimized for performance?)", + options={ + 4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.", + 3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.", + 2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.", + 1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.", + 0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.", + }, + ), +] + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git 
a/fern/assets/recipes/code_generation/text_to_sql.py b/fern/assets/recipes/code_generation/text_to_sql.py new file mode 100644 index 000000000..a0fbf6e51 --- /dev/null +++ b/fern/assets/recipes/code_generation/text_to_sql.py @@ -0,0 +1,323 @@ +from pathlib import Path + +from data_designer.essentials import ( + CategorySamplerParams, + CodeLang, + CodeValidatorParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, + ValidationColumnConfig, + ValidatorType, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="industry_sector", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Healthcare", "Finance", "Technology"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="industry_sector", + values={ + "Healthcare": [ + "Electronic Health Records (EHR) Systems", + "Telemedicine Platforms", + "AI-Powered Diagnostic Tools", + ], + "Finance": [ + "Fraud Detection Software", + "Automated Trading Systems", + "Personal Finance Apps", + ], + "Technology": [ + "Cloud Computing Platforms", + "Artificial Intelligence and Machine Learning Platforms", + "DevOps and CI/CD Tools", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Beginner", "Intermediate", "Advanced"], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_concept", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="sql_complexity", + values={ + 
"Beginner": [ + "Basic SELECT Statements", + "WHERE Clauses", + "Basic JOINs", + "INSERT, UPDATE, DELETE", + ], + "Intermediate": [ + "Aggregation Functions", + "Multiple JOINs", + "Subqueries", + "Views", + ], + "Advanced": [ + "Window Functions", + "Common Table Expressions (CTEs)", + "Stored Procedures", + "Query Optimization", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="sql_task_type", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Data Retrieval", + "Data Manipulation", + "Analytics and Reporting", + "Data Transformation", + ], + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="instruction_phrase", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Write an SQL query that", + "Create an SQL statement to", + "Develop an SQL query to", + "Can you write SQL that", + "Formulate an SQL query that", + ], + ), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="sql_prompt", + model_alias=model_alias, + system_prompt="You are an expert at generating clear and specific SQL tasks.", + prompt=SQL_PROMPT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql_context", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt=( + "You are an expert SQL database designer who creates clean, efficient, and " + "well-structured database schemas." 
+ ), + prompt=SQL_CONTEXT_TEXT, + ) + ) + + config_builder.add_column( + LLMCodeColumnConfig( + name="sql", + model_alias=model_alias, + code_lang=CodeLang.SQL_ANSI, + system_prompt="You are an expert SQL programmer who writes clean, efficient, and well-structured queries.", + prompt=SQL_CODE_TEXT, + ) + ) + + config_builder.add_column( + ValidationColumnConfig( + name="code_validity_result", + validator_type=ValidatorType.CODE, + target_columns=["sql"], + validator_params=CodeValidatorParams( + code_lang=CodeLang.SQL_ANSI, + ), + batch_size=100, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="code_judge_result", + model_alias=model_alias, + prompt=TEXT_TO_SQL_JUDGE_TEMPLATE, + scores=sql_scoring, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +SQL_PROMPT_TEXT = ( + "Generate an instruction to create SQL code that solves a specific problem.\n" + "Each instruction should begin with one of the following phrases: {{instruction_phrase}}.\n\n" + "Important Guidelines:\n" + "* Industry Relevance: Ensure the instruction pertains to the {{industry_sector}} sector and {{topic}} topic.\n" + "* SQL Complexity: Tailor the instruction to the {{sql_complexity}} level. Utilize relevant {{sql_concept}} " + "where appropriate to match the complexity level.\n" + "* Task Type: The instruction should involve a {{sql_task_type}} task.\n" + "* Clarity and Specificity: Make the problem statement clear and unambiguous. 
Provide sufficient context to " + "understand the requirements without being overly verbose.\n" + "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n" +) + +SQL_CONTEXT_TEXT = ( + "Generate the SQL for creating database tables that would be relevant for the following instruction:\n" + "Instruction: {{sql_prompt}}\n\n" + "Important Guidelines:\n" + "* Relevance: Ensure all tables are directly related to the {{industry_sector}} sector and {{topic}} topic.\n" + "* Completeness: Include all essential columns with appropriate data types, primary/foreign keys, and necessary constraints.\n" + "* Realism: Use realistic table structures typical for the specified industry.\n" + "* Executable SQL: Provide complete CREATE TABLE statements that can be run without modification.\n" + "* Consistency: Use consistent naming conventions (e.g., snake_case for table and column names).\n" + "* Sample Data: Include INSERT statements with sample data that makes sense for the tables (at least 5-10 rows per table)." +) + +SQL_CODE_TEXT = ( + "Write SQL code for the following instruction based on the provided database context:\n" + "Instruction: {{sql_prompt}}\n\n" + "Database Context:\n" + "{{sql_context}}\n\n" + "Important Guidelines:\n" + "* Code Quality: Your SQL should be clean, complete, self-contained and accurate.\n" + "* Code Validity: Please ensure that your SQL code is executable and does not contain any errors.\n" + "* Context: Base your query on the provided database context. 
Only reference tables and columns that " + "exist in the context.\n" + "* Complexity & Concepts: The SQL should be written at a {{sql_complexity}} level, making use of " + "concepts such as {{sql_concept}}.\n" + "* Task Type: Ensure your solution implements the appropriate {{sql_task_type}} operation.\n" + "* Comments: Include brief comments explaining the key parts of your query.\n" +) + + +TEXT_TO_SQL_JUDGE_TEMPLATE = """\ +You are an expert in SQL with deep knowledge of relational modeling, query semantics, +and performance tuning across common dialects (e.g., PostgreSQL, MySQL, SQLite, SQL Server). +You think critically about correctness, readability, and efficiency. + +Use the SQL Query Quality Rubric below to score the **Generated SQL Query** based on the INSTRUCTIONS. + +#### INSTRUCTIONS +The Generated SQL Query should be a valid response to the Natural Language Prompt below + +Natural Language Prompt: +{{ sql_prompt }} + +Database Context: +{{ sql_context }} + +Generated SQL Query +{{ sql }} +""" + + +sql_scoring = [ + Score( + name="Relevance", + description="Adherence to INSTRUCTIONS and CONTEXT", + options={ + 4: "Perfectly meets all specified requirements.", + 3: "Meets most requirements with minor deviations.", + 2: "Moderate deviation from the instructions.", + 1: "Significant deviations from the instructions.", + 0: "Does not adhere to the instructions.", + }, + ), + Score( + name="SQL Correctness", + description="Syntax and semantic correctness; returns the intended result", + options={ + 4: "Valid SQL with correct joins, filters, grouping/aggregation, and NULL handling; produces the intended result set under the stated/implicit dialect.", + 3: "Generally correct with minor issues (e.g., edge-case NULLs, minor grouping detail) but still likely yields the intended result.", + 2: "Partially correct; noticeable semantic mistakes (joins, grouping, filters) that may change results or fail in edge cases.", + 1: "Largely incorrect; major semantic or 
syntactic errors likely causing failure or wrong results.", + 0: "Invalid SQL or unrelated to the task; will not run or cannot produce a meaningful result.", + }, + ), + Score( + name="Readability", + description="Formatting, clarity, and maintainability", + options={ + 4: "Cleanly formatted (keywords/clauses consistently styled), clear structure (CTEs/subqueries where helpful), meaningful table/column aliases, and concise.", + 3: "Generally readable with consistent formatting and understandable aliases; could be organized slightly better.", + 2: "Somewhat readable but inconsistent formatting or confusing aliasing; structure is harder to follow.", + 1: "Poorly formatted and hard to read; unclear structure and aliasing.", + 0: "Unreadable or chaotic; no meaningful structure or styling.", + }, + ), + Score( + name="Efficiency", + description="Query performance best practices", + options={ + 4: "Uses sargable predicates, appropriate joins, selective filters early, avoids SELECT *, unnecessary DISTINCT, and wasteful subqueries; likely to use indexes effectively.", + 3: "Mostly efficient; minor opportunities for improvement (e.g., simplifying expressions, reducing data early).", + 2: "Moderate inefficiencies (e.g., non-sargable filters, unnecessary nested subqueries, broad SELECT *).", + 1: "Notably inefficient patterns likely causing large scans or poor plans.", + 0: "Highly inefficient; ignores basic best practices and likely to perform very poorly.", + }, + ), +] + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + 
print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/assets/recipes/mcp_and_tooluse/basic_mcp.py b/fern/assets/recipes/mcp_and_tooluse/basic_mcp.py new file mode 100644 index 000000000..150e26d92 --- /dev/null +++ b/fern/assets/recipes/mcp_and_tooluse/basic_mcp.py @@ -0,0 +1,240 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "data-designer", +# "mcp", +# ] +# /// +"""Basic MCP Recipe: Simple Tool Use Example + +This recipe demonstrates the minimal MCP tool-calling workflow with Data Designer: + +1) Define a simple MCP server with basic tools (get_fact, add_numbers) +2) Configure Data Designer to use the MCP tools +3) Generate data that requires tool calls to complete + +Prerequisites: + - OPENAI_API_KEY environment variable for OpenAI provider model aliases. + - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases (default model alias is "nvidia-text"). 
+ +Run: + # Basic usage (generates 2 records by default) + uv run basic_mcp.py + + # For help message and available options + uv run basic_mcp.py --help +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +from mcp.server.fastmcp import FastMCP + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +MCP_SERVER_NAME = "basic-tools" + + +# ============================================================================= +# MCP Server Definition +# ============================================================================= + +mcp_server = FastMCP(MCP_SERVER_NAME) + +# Simple knowledge base for the get_fact tool +FACTS = { + "python": "Python was created by Guido van Rossum and first released in 1991.", + "earth": "Earth is the third planet from the Sun and has one natural satellite, the Moon.", + "water": "Water (H2O) freezes at 0Β°C (32Β°F) and boils at 100Β°C (212Β°F) at sea level.", + "light": "The speed of light in a vacuum is approximately 299,792 kilometers per second.", +} + + +@mcp_server.tool() +def get_fact(topic: str) -> str: + """Get a fact about a topic from the knowledge base. + + Args: + topic: The topic to look up (e.g., "python", "earth", "water", "light") + + Returns: + A fact about the topic, or an error message if not found. + """ + topic_lower = topic.lower() + if topic_lower in FACTS: + return json.dumps({"topic": topic, "fact": FACTS[topic_lower]}) + return json.dumps({"error": f"No fact found for topic: {topic}", "available_topics": list(FACTS.keys())}) + + +@mcp_server.tool() +def add_numbers(a: float, b: float) -> str: + """Add two numbers together. + + Args: + a: First number + b: Second number + + Returns: + The sum of the two numbers. + """ + result = a + b + return json.dumps({"a": a, "b": b, "sum": result}) + + +@mcp_server.tool() +def list_topics() -> str: + """List all available topics in the knowledge base. 
+ + Returns: + List of available topics. + """ + return json.dumps({"topics": list(FACTS.keys())}) + + +# ============================================================================= +# Data Designer Configuration +# ============================================================================= + + +def build_config(model_alias: str, provider_name: str) -> dd.DataDesignerConfigBuilder: + """Build the Data Designer configuration for basic tool use.""" + tool_config = dd.ToolConfig( + tool_alias="basic-tools", + providers=[provider_name], + allow_tools=["get_fact", "add_numbers", "list_topics"], + max_tool_call_turns=5, + timeout_sec=30.0, + ) + + config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config]) + + # Add a seed column with topics to look up + config_builder.add_column( + dd.SamplerColumnConfig( + name="topic", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["python", "earth", "water", "light"]), + ) + ) + + # Add a column that uses the get_fact tool + config_builder.add_column( + dd.LLMTextColumnConfig( + name="fact_response", + model_alias=model_alias, + prompt=( + "Use the get_fact tool to look up information about '{{ topic }}', " + "then provide a one-sentence summary of what you learned." + ), + system_prompt="You must call the get_fact tool before answering. 
Only use information from tool results.", + tool_alias="basic-tools", + with_trace=dd.TraceType.ALL_MESSAGES, + ) + ) + + # Add a column that uses the add_numbers tool + config_builder.add_column( + dd.SamplerColumnConfig( + name="num_a", + sampler_type=dd.SamplerType.UNIFORM, + params=dd.UniformSamplerParams(low=1, high=100), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="num_b", + sampler_type=dd.SamplerType.UNIFORM, + params=dd.UniformSamplerParams(low=1, high=100), + ) + ) + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="math_response", + model_alias=model_alias, + prompt=( + "Use the add_numbers tool to calculate {{ num_a }} + {{ num_b }}, " + "then report the result in a complete sentence." + ), + system_prompt="You must call the add_numbers tool to perform the calculation. Report the exact result.", + tool_alias="basic-tools", + with_trace=dd.TraceType.ALL_MESSAGES, + ) + ) + + return config_builder + + +# ============================================================================= +# Main Entry Points +# ============================================================================= + + +def serve() -> None: + """Run the MCP server (called when launched as subprocess by Data Designer).""" + mcp_server.run() + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Basic MCP tool use example with Data Designer.") + subparsers = parser.add_subparsers(dest="command") + + # 'serve' subcommand for running the MCP server + subparsers.add_parser("serve", help="Run the MCP server (used by Data Designer)") + + # Default command arguments (demo mode) + parser.add_argument("--model-alias", type=str, default="nvidia-text", help="Model alias to use for generation") + parser.add_argument("--num-records", type=int, default=2, help="Number of records to generate") + # For compatibility with Makefile test-run-recipes target (ignored in demo mode) + 
parser.add_argument("--artifact-path", type=str, default=None, help=argparse.SUPPRESS) + + return parser.parse_args() + + +def main() -> None: + """Main entry point for the demo.""" + args = parse_args() + + # Handle 'serve' subcommand + if args.command == "serve": + serve() + return + + # Demo mode: run Data Designer with the MCP server + if os.environ.get("NVIDIA_API_KEY") is None and args.model_alias.startswith("nvidia"): + raise RuntimeError("NVIDIA_API_KEY must be set when using NVIDIA model aliases.") + + # Configure MCP provider to run via stdio transport (local subprocess) + mcp_provider = dd.LocalStdioMCPProvider( + name=MCP_SERVER_NAME, + command=sys.executable, + args=[str(Path(__file__).resolve()), "serve"], + ) + + config_builder = build_config( + model_alias=args.model_alias, + provider_name=MCP_SERVER_NAME, + ) + + data_designer = DataDesigner(mcp_providers=[mcp_provider]) + preview_results = data_designer.preview(config_builder, num_records=args.num_records) + + # Display results + print("\n" + "=" * 60) + print("GENERATED DATA") + print("=" * 60) + preview_results.display_sample_record() + + +if __name__ == "__main__": + main() diff --git a/fern/assets/recipes/mcp_and_tooluse/pdf_qa.py b/fern/assets/recipes/mcp_and_tooluse/pdf_qa.py new file mode 100644 index 000000000..024ed4f37 --- /dev/null +++ b/fern/assets/recipes/mcp_and_tooluse/pdf_qa.py @@ -0,0 +1,572 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "data-designer", +# "mcp", +# "bm25s", +# "pymupdf", +# "rich", +# ] +# /// +"""MCP + Tool Use Recipe: Document Q&A with BM25S Lexical Search + +This recipe demonstrates an end-to-end MCP tool-calling workflow: + +1) Load one or more PDF documents from URLs or local paths. +2) Index them with BM25S for fast lexical search. 
+3) Use Data Designer tool calls (`search_docs`) to generate grounded Q&A pairs. + +Prerequisites: + - OPENAI_API_KEY environment variable for OpenAI provider model aliases. + - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases (default model alias is "nvidia-reasoning"). + +Run: + # Basic usage with default sample PDF (generates 4 Q&A pairs) + uv run pdf_qa.py + + # For help message and available options + uv run pdf_qa.py --help +""" + +from __future__ import annotations + +import argparse +import io +import json +import os +import sys +from pathlib import Path +from urllib.parse import urlparse +from urllib.request import urlopen + +import bm25s +import fitz +from mcp.server.fastmcp import FastMCP +from pydantic import BaseModel, Field + +import data_designer.config as dd +from data_designer.config.preview_results import PreviewResults +from data_designer.interface import DataDesigner + +DEFAULT_PDF_URL = "https://research.nvidia.com/labs/nemotron/files/NVIDIA-Nemotron-3-Nano-Technical-Report.pdf" +MCP_SERVER_NAME = "doc-bm25-search" + +# Global state for the BM25 index (populated at server startup) +_bm25_retriever: bm25s.BM25 | None = None +_corpus: list[dict[str, str]] = [] + + +class QAPair(BaseModel): + question: str = Field(..., description="A question grounded in the document text.") + answer: str = Field(..., description="A concise answer grounded in the supporting passage.") + supporting_passage: str = Field( + ..., description="A short excerpt (2-4 sentences) copied from the search result that supports the answer." + ) + citation: str = Field( + ..., description="The citation (e.g. source url, page number, etc) of the supporting passage." 
+ ) + + +class TopicList(BaseModel): + topics: list[str] = Field( + ..., + description="High-level topics covered by the document.", + ) + + +def _is_url(path_or_url: str) -> bool: + """Check if the given string is a URL.""" + parsed = urlparse(path_or_url) + return parsed.scheme in ("http", "https") + + +def _get_source_name(path_or_url: str) -> str: + """Extract a human-readable source name from a path or URL.""" + if _is_url(path_or_url): + parsed = urlparse(path_or_url) + return Path(parsed.path).name or parsed.netloc + return Path(path_or_url).name + + +def extract_pdf_text(path_or_url: str) -> list[dict[str, str]]: + """Extract text from a PDF file or URL, returning a list of passages with metadata. + + Each passage corresponds to a page from the PDF. + + Args: + path_or_url: Either a local file path or a URL to a PDF document. + URLs are streamed directly into memory without saving to disk. + + Returns: + List of passage dictionaries with 'text', 'page', and 'source' keys. + """ + passages: list[dict[str, str]] = [] + source_name = _get_source_name(path_or_url) + + if _is_url(path_or_url): + with urlopen(path_or_url) as response: + pdf_bytes = response.read() + doc = fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf") + else: + doc = fitz.open(path_or_url) + + for page_num in range(len(doc)): + page = doc[page_num] + text = page.get_text("text").strip() + if text: + passages.append( + { + "text": text, + "page": str(page_num + 1), + "source": source_name, + } + ) + + doc.close() + return passages + + +def build_bm25_index(passages: list[dict[str, str]]) -> bm25s.BM25: + """Build a BM25S index from the extracted passages.""" + corpus_texts = [p["text"] for p in passages] + corpus_tokens = bm25s.tokenize(corpus_texts, stopwords="en") + + retriever = bm25s.BM25() + retriever.index(corpus_tokens) + + return retriever + + +def initialize_search_index(pdf_sources: list[str]) -> None: + """Load PDFs from paths/URLs and build the BM25 index. 
+ + Args: + pdf_sources: List of PDF file paths or URLs to index. + """ + global _bm25_retriever, _corpus + + _corpus = [] + for source in pdf_sources: + passages = extract_pdf_text(source) + _corpus.extend(passages) + + if _corpus: + _bm25_retriever = build_bm25_index(_corpus) + + +# MCP Server Definition +mcp_server = FastMCP(MCP_SERVER_NAME) + + +@mcp_server.tool() +def search_docs(query: str, limit: int = 5, document: str = "", page: str = "") -> str: + """Search through documents using BM25 lexical search. + + BM25 is a keyword-based retrieval algorithm that matches exact terms. For best results: + + - Use specific keywords, not full questions (e.g., "configuration parameters timeout" not "How do I set the timeout?") + - Include domain-specific terms that would appear in the source text + - Combine multiple relevant terms to narrow results (e.g., "installation requirements dependencies") + - Try synonyms or alternative phrasings if initial searches return poor results + - Avoid filler words and focus on content-bearing terms + + Examples: + Good queries: + - "error handling retry mechanism" + - "authentication token expiration" + - "memory allocation buffer size" + + Less effective queries: + - "What are the error handling options?" + - "Tell me about authentication" + - "How does memory work?" 
+ + Args: + query: Search query string - use specific keywords for best results + limit: Maximum number of results to return (default: 5) + document: Optional document source name to restrict search to (use list_docs to see available documents) + page: Optional page number to restrict search to (requires document to be specified) + + Returns: + JSON string with search results including text excerpts and page numbers + """ + global _bm25_retriever, _corpus + + if _bm25_retriever is None or not _corpus: + return json.dumps({"error": "Search index not initialized"}) + + # Validate that page requires document + if page and not document: + return json.dumps({"error": "The 'page' parameter requires 'document' to be specified"}) + + query_tokens = bm25s.tokenize([query], stopwords="en") + + # When filtering, retrieve more results to ensure we have enough after filtering + retrieve_limit = len(_corpus) if (document or page) else limit + results, scores = _bm25_retriever.retrieve(query_tokens, k=min(retrieve_limit, len(_corpus))) + + search_results: list[dict[str, str | float]] = [] + for i in range(results.shape[1]): + doc_idx = results[0, i] + score = float(scores[0, i]) + + if score <= 0: + continue + + passage = _corpus[doc_idx] + + # Apply document filter + if document and passage["source"] != document: + continue + + # Apply page filter + if page and passage["page"] != page: + continue + + search_results.append( + { + "text": passage["text"][:2000], + "page": passage["page"], + "source": passage["source"], + "score": round(score, 4), + "url": f"file://{passage['source']}#page={passage['page']}", + } + ) + + # Stop once we have enough results + if len(search_results) >= limit: + break + + return json.dumps({"results": search_results, "query": query, "total": len(search_results)}) + + +@mcp_server.tool() +def list_docs() -> str: + """List all documents in the search index with their page counts. 
+ + Returns: + JSON string with a list of documents, each containing the source name and page count. + """ + global _corpus + + if not _corpus: + return json.dumps({"error": "Search index not initialized", "documents": []}) + + doc_pages: dict[str, set[str]] = {} + for passage in _corpus: + source = passage["source"] + page = passage["page"] + if source not in doc_pages: + doc_pages[source] = set() + doc_pages[source].add(page) + + documents = [{"source": source, "page_count": len(pages)} for source, pages in sorted(doc_pages.items())] + + return json.dumps({"documents": documents, "total_documents": len(documents)}) + + +def build_config(model_alias: str, provider_name: str) -> dd.DataDesignerConfigBuilder: + """Build the Data Designer configuration for document Q&A generation.""" + tool_config = dd.ToolConfig( + tool_alias="doc-search", + providers=[provider_name], + allow_tools=["list_docs", "search_docs"], + max_tool_call_turns=100, + timeout_sec=30.0, + ) + + config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config]) + config_builder.add_column( + dd.SamplerColumnConfig( + name="seed_id", + sampler_type=dd.SamplerType.UUID, + params=dd.UUIDSamplerParams(), + drop=True, + ) + ) + + config_builder.add_column( + dd.LLMStructuredColumnConfig( + name="topic_candidates", + model_alias=model_alias, + prompt="Extract a high-level list of all topics covered by documents our knowledge base.", + system_prompt=( + "You must call tools before answering. " + "Do not use outside knowledge; only use tool results. " + "You can use as many tool calls as required to answer the user query." 
+ ), + output_format=TopicList, + tool_alias="doc-search", + with_trace=dd.TraceType.ALL_MESSAGES, # Enable trace to capture tool call history + ) + ) + + config_builder.add_column( + dd.ExpressionColumnConfig( + name="topic", + expr="{{ topic_candidates.topics | random }}", + ) + ) + + qa_prompt = """\ +Create a question-answer pair on the topic "{{topic}}", with supporting text and citation. +The supporting_passage must be a 2-4 sentence excerpt copied from the tool result that demonstrates +why the answer is correct. +""" + + config_builder.add_column( + dd.LLMStructuredColumnConfig( + name="qa_pair", + model_alias=model_alias, + prompt=qa_prompt, + system_prompt=( + "You must call tools before answering. " + "Do not use outside knowledge; only use tool results. " + "You can use as many tool calls as required to answer the user query." + ), + output_format=QAPair, + tool_alias="doc-search", + with_trace=dd.TraceType.ALL_MESSAGES, # Enable trace to capture tool call history + extract_reasoning_content=True, + ) + ) + + config_builder.add_column( + dd.ExpressionColumnConfig( + name="question", + expr="{{ qa_pair.question }}", + ) + ) + config_builder.add_column( + dd.ExpressionColumnConfig( + name="answer", + expr="{{ qa_pair.answer }}", + ) + ) + config_builder.add_column( + dd.ExpressionColumnConfig( + name="supporting_passage", + expr="{{ qa_pair.supporting_passage }}", + ) + ) + config_builder.add_column( + dd.ExpressionColumnConfig( + name="citation", + expr="{{ qa_pair.citation }}", + ) + ) + return config_builder + + +def generate_preview( + config_builder: dd.DataDesignerConfigBuilder, + num_records: int, + mcp_provider: dd.LocalStdioMCPProvider, +) -> PreviewResults: + """Run Data Designer preview with the MCP provider.""" + data_designer = DataDesigner(mcp_providers=[mcp_provider]) + # Traces are enabled per-column via with_trace=True on LLM column configs + return data_designer.preview(config_builder, num_records=num_records) + + +def _truncate(text: 
str, max_length: int = 100) -> str: + """Truncate text to max_length, adding ellipsis if needed.""" + text = text.replace("\n", " ").strip() + if len(text) <= max_length: + return text + return text[: max_length - 3] + "..." + + +def _summarize_content(content: object) -> str: + """Summarize ChatML-style content blocks for display.""" + if isinstance(content, list): + parts: list[str] = [] + for block in content: + if isinstance(block, dict): + block_type = block.get("type", "block") + if block_type == "text": + text = str(block.get("text", "")) + if text: + parts.append(text) + elif block_type == "image_url": + parts.append("[image]") + else: + parts.append(f"[{block_type}]") + else: + parts.append(str(block)) + return " ".join(parts) + return str(content) + + +def _format_trace_step(msg: dict[str, object]) -> str: + """Format a single trace message as a concise one-liner.""" + role = msg.get("role", "unknown") + content = _summarize_content(msg.get("content", "")) + reasoning = msg.get("reasoning_content") + tool_calls = msg.get("tool_calls") + tool_call_id = msg.get("tool_call_id") + + if role == "system": + return f"[bold cyan]system[/]({_truncate(str(content))})" + + if role == "user": + return f"[bold green]user[/]({_truncate(str(content))})" + + if role == "assistant": + parts: list[str] = [] + if reasoning: + parts.append(f"[bold magenta]reasoning[/]({_truncate(str(reasoning))})") + if tool_calls and isinstance(tool_calls, list): + for tc in tool_calls: + if isinstance(tc, dict): + func = tc.get("function", {}) + if isinstance(func, dict): + name = func.get("name", "?") + args = func.get("arguments", "") + parts.append(f"[bold yellow]tool_call[/]({name}: {_truncate(str(args), 60)})") + if content: + parts.append(f"[bold blue]content[/]({_truncate(str(content))})") + return "\n".join(parts) if parts else "[bold blue]assistant[/](empty)" + + if role == "tool": + tool_id = str(tool_call_id or "?")[:8] + return f"[bold red]tool_response[/]([{tool_id}] 
{_truncate(str(content), 80)})" + + return f"[dim]{role}[/]({_truncate(str(content))})" + + +def _display_column_trace(column_name: str, trace: list[dict[str, object]]) -> None: + """Display a trace for a single column using Rich Panel.""" + from rich.console import Console + from rich.panel import Panel + + console = Console() + lines: list[str] = [] + + for msg in trace: + if not isinstance(msg, dict): + continue + formatted = _format_trace_step(msg) + for line in formatted.split("\n"): + lines.append(f" * {line}") + + trace_content = "\n".join(lines) if lines else " (no trace messages)" + panel = Panel( + trace_content, + title=f"[bold]Column Trace: {column_name}[/]", + border_style="blue", + padding=(0, 1), + ) + console.print(panel) + + +def display_preview_record(preview_results: PreviewResults) -> None: + """Display a sample record from the preview results with trace visualization.""" + from rich.console import Console + + console = Console() + dataset = preview_results.dataset + + if dataset is None or dataset.empty: + console.print("[red]No preview records generated.[/]") + return + + record = dataset.iloc[0].to_dict() + + # Find trace columns and their base column names + trace_columns = [col for col in dataset.columns if col.endswith("__trace")] + + # Display non-trace columns as summary + non_trace_record = {k: v for k, v in record.items() if not k.endswith("__trace")} + console.print("\n[bold]Sample Record (data columns):[/]") + console.print(json.dumps(non_trace_record, indent=2, default=str)) + + # Display each trace column in its own panel + if trace_columns: + console.print("\n[bold]Generation Traces:[/]") + for trace_col in trace_columns: + base_name = trace_col.replace("__trace", "") + trace_data = record.get(trace_col) + if isinstance(trace_data, list): + _display_column_trace(base_name, trace_data) + + preview_results.display_sample_record() + + +def serve() -> None: + """Run the MCP server (called when launched as subprocess by Data 
Designer).""" + pdf_sources_json = os.environ.get("PDF_SOURCES", "[]") + pdf_sources = json.loads(pdf_sources_json) + if not pdf_sources: + pdf_sources = [DEFAULT_PDF_URL] + initialize_search_index(pdf_sources) + mcp_server.run() + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Generate document Q&A pairs using MCP tool calls with BM25S search.") + subparsers = parser.add_subparsers(dest="command") + + # 'serve' subcommand for running the MCP server + subparsers.add_parser("serve", help="Run the MCP server (used by Data Designer)") + + # Default command arguments (demo mode) + parser.add_argument("--model-alias", type=str, default="nvidia-reasoning", help="Model alias to use for generation") + parser.add_argument("--num-records", type=int, default=4, help="Number of Q&A pairs to generate") + parser.add_argument( + "--pdf", + type=str, + action="append", + dest="pdfs", + metavar="PATH_OR_URL", + help="PDF file path or URL to index (can be specified multiple times). 
Defaults to a sample PDF if not provided.", + ) + # For compatibility with Makefile test-run-recipes target (ignored in demo mode) + parser.add_argument("--artifact-path", type=str, default=None, help=argparse.SUPPRESS) + + return parser.parse_args() + + +def main() -> None: + """Main entry point for the demo.""" + args = parse_args() + + # Handle 'serve' subcommand + if args.command == "serve": + serve() + return + + # Demo mode: run Data Designer with the BM25S MCP server + if os.environ.get("NVIDIA_API_KEY") is None and args.model_alias.startswith("nvidia"): + raise RuntimeError("NVIDIA_API_KEY must be set when using NVIDIA model aliases.") + + # Use provided PDFs or fall back to default + pdf_sources = args.pdfs if args.pdfs else [DEFAULT_PDF_URL] + + # Configure MCP provider to run via stdio transport (local subprocess) + mcp_provider = dd.LocalStdioMCPProvider( + name=MCP_SERVER_NAME, + command=sys.executable, + args=[str(Path(__file__).resolve()), "serve"], + env={"PDF_SOURCES": json.dumps(pdf_sources)}, + ) + + config_builder = build_config( + model_alias=args.model_alias, + provider_name=MCP_SERVER_NAME, + ) + + preview_results = generate_preview( + config_builder=config_builder, + num_records=args.num_records, + mcp_provider=mcp_provider, + ) + + display_preview_record(preview_results) + + +if __name__ == "__main__": + main() diff --git a/fern/assets/recipes/qa_and_chat/multi_turn_chat.py b/fern/assets/recipes/qa_and_chat/multi_turn_chat.py new file mode 100644 index 000000000..b4debed74 --- /dev/null +++ b/fern/assets/recipes/qa_and_chat/multi_turn_chat.py @@ -0,0 +1,204 @@ +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + SubcategorySamplerParams, +) +from 
data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + + config_builder.add_column( + SamplerColumnConfig( + name="domain", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Tech Support", "Personal Finances", "Educational Guidance"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="topic", + sampler_type=SamplerType.SUBCATEGORY, + params=SubcategorySamplerParams( + category="domain", + values={ + "Tech Support": [ + "Troubleshooting a Laptop", + "Setting Up a Home Wi-Fi Network", + "Installing Software Updates", + ], + "Personal Finances": [ + "Budgeting Advice", + "Understanding Taxes", + "Investment Strategies", + ], + "Educational Guidance": [ + "Choosing a College Major", + "Effective Studying Techniques", + "Learning a New Language", + ], + }, + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="complexity", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["Basic", "Intermediate", "Advanced"]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="conversation_length", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=[2, 4, 6, 8]), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="user_mood", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=["happy", "silly", "sarcastic", "combative", "disappointed", "toxic"]), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="assistant_system_prompt", + prompt=( + "Write a reasonable system prompt for a helpful AI assistant with expertise in " + "{{domain}} and {{topic}}. The AI assistant must not engage in harmful behaviors." 
+ ), + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="user_task", + prompt="Define a simple task related to {{topic}} of {{complexity}} complexity for the user.", + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="conversation", + prompt=( + "\n{{user_task}}\n\n\n" + "{{assistant_system_prompt}}\n\n" + "Generate a conversation between a user and an AI assistant with about .\n" + "User is asking the assistant for advice and is in a {{user_mood}} mood.\n" + "The conversation must be {{conversation_length}} messages in length.\n" + "The conversation must come to a natural end in {{conversation_length}} messages and if the assistant is unable " + "to solve the user's needs by then, they should offer to continue the conversation later or redirect them to additional resources.\n" + "The conversation must be realistic and natural, incorporating details from both the user's and assistant's profiles.\n" + "The AI assistant role has the system prompt defined in . 
All of its responses must conform to :\n" + ), + output_format=ChatConversation, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + LLMJudgeColumnConfig( + name="toxicity_evaluation", + prompt=TOXICITY_PROMPT_TEMPLATE, + scores=[toxicity_score], + model_alias=model_alias, + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +toxicity_score = Score( + name="Toxicity", + description="Evaluation of harmful, harassing, or hateful language in conversations", + options={ + "4": "No toxicity detected; conversation is respectful and appropriate.", + "3": "Mild toxicity; slightly rude or sarcastic but not hateful or harmful.", + "2": "Moderate toxicity; contains disrespectful or harassing language.", + "1": "Severe toxicity; contains overt hate, harassment, or harmful content.", + "0": "Extreme toxicity; contains dangerous, threatening, or abusive content.", + }, +) + +TOXICITY_PROMPT_TEMPLATE = ( + "You are an expert in content moderation and digital communication ethics. " + "You are evaluating the toxicity level in user messages.\n" + "Take a deep breath and carefully analyze the conversation provided. 
\n" + "Use the rubric to score the level of toxicity in the USER'S messages only.\n" + "Consider:\n" + "- Focus primarily on harassment, hate, or harmful language\n" + "- Evaluate the full context of the conversation\n" + "- Be objective and consistent in your assessment\n" + "- If any single message exhibits toxicity, it should influence the final score\n" + "## CONVERSATION\n" + "{{ conversation }}" +) + + +class Message(BaseModel): + """A single message turn in the conversation.""" + + role: Literal["user", "assistant"] = Field(..., description="Which role is writing the message.") + content: str = Field(..., description="Message contents.") + + +class ChatConversation(BaseModel): + """A chat conversation between a specific user and an AI assistant. + * All conversations are initiated by the user role. + * The assistant role always responds to the user message. + * Turns alternate between user and assistant roles. + * The last message is always from the assistant role. + * Message content can be long or short. + * All assistant messages are faithful responses and must be answered fully. 
+ """ + + conversation: list[Message] = Field(..., description="List of all messages in the conversation.") + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/assets/recipes/qa_and_chat/product_info_qa.py b/fern/assets/recipes/qa_and_chat/product_info_qa.py new file mode 100644 index 000000000..59a0110f6 --- /dev/null +++ b/fern/assets/recipes/qa_and_chat/product_info_qa.py @@ -0,0 +1,224 @@ +import string +from pathlib import Path + +from pydantic import BaseModel, Field + +from data_designer.essentials import ( + BernoulliSamplerParams, + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + ExpressionColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, + Score, + UniformSamplerParams, +) +from data_designer.interface.results import DatasetCreationResults + + +def build_config(model_alias: str) -> DataDesignerConfigBuilder: + config_builder = DataDesignerConfigBuilder() + config_builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=[ + "Electronics", + "Clothing", + "Home Appliances", + "Groceries", + "Toiletries", + "Sports Equipment", + "Toys", + "Books", + "Pet Supplies", + "Tools & Home Improvement", + "Beauty", + "Health & Wellness", + "Outdoor Gear", + "Automotive", + "Jewelry", + "Watches", + "Office Supplies", 
+ "Gifts", + "Arts & Crafts", + "Baby & Kids", + "Music", + "Video Games", + "Movies", + "Software", + "Tech Devices", + ] + ), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="price_tens_of_dollars", + sampler_type=SamplerType.UNIFORM, + params=UniformSamplerParams(low=1, high=200), + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="product_price", + expr="{{ (price_tens_of_dollars * 10) - 0.01 | round(2) }}", + dtype="float", + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="first_letter", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams(values=list(string.ascii_uppercase)), + ) + ) + + config_builder.add_column( + SamplerColumnConfig( + name="is_hallucination", + sampler_type=SamplerType.BERNOULLI, + params=BernoulliSamplerParams(p=0.5), + ) + ) + + config_builder.add_column( + LLMStructuredColumnConfig( + name="product_info", + model_alias=model_alias, + prompt=( + "Generate a realistic product description for a product in the {{ category }} " + "category that costs {{ product_price }}.\n" + "The name of the product MUST start with the letter {{ first_letter }}.\n" + ), + output_format=ProductInfo, + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="question", + model_alias=model_alias, + prompt=("Ask a question about the following product:\n\n {{ product_info }}"), + ) + ) + + config_builder.add_column( + LLMTextColumnConfig( + name="answer", + model_alias=model_alias, + prompt=( + "{%- if is_hallucination == 0 -%}\n" + "\n" + "{{ product_info }}\n" + "\n" + "{%- endif -%}\n" + "User Question: {{ question }}\n" + "Directly and succinctly answer the user's question.\n" + "{%- if is_hallucination == 1 -%}\n" + "Make up whatever information you need to in order to answer the user's request.\n" + "{%- endif -%}" + ), + ) + ) + + # Evaluate answer quality + config_builder.add_column( + LLMJudgeColumnConfig( + name="llm_answer_metrics", + model_alias=model_alias, + 
prompt=( + "\n" + "{{ product_info }}\n" + "\n" + "User Question: {{question }}\n" + "AI Assistant Answer: {{ answer }}\n" + "Judge the AI assistant's response to the user's question about the product described in ." + ), + scores=answer_quality_scores, + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="completeness_result", + expr="{{ llm_answer_metrics.Completeness.score }}", + ) + ) + + config_builder.add_column( + ExpressionColumnConfig( + name="accuracy_result", + expr="{{ llm_answer_metrics.Accuracy.score }}", + ) + ) + + return config_builder + + +def create_dataset( + config_builder: DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +class ProductInfo(BaseModel): + product_name: str = Field(..., description="A realistic product name for the market.") + key_features: list[str] = Field(..., min_length=1, max_length=3, description="Key product features.") + description: str = Field( + ..., + description="A short, engaging description of what the product does, highlighting a unique but believable feature.", + ) + price_usd: float = Field(..., description="The price of the product", ge=10, le=1000, decimal_places=2) + + +completeness_score = Score( + name="Completeness", + description="Evaluation of AI assistant's thoroughness in addressing all aspects of the user's query.", + options={ + "Complete": "The response thoroughly covers all key points requested in the question, providing sufficient detail to satisfy the user's information needs.", + "PartiallyComplete": "The response addresses the core question but omits certain important details or fails to elaborate on relevant aspects that were requested.", + "Incomplete": "The response significantly lacks necessary information, missing major components of what was asked 
and leaving the query largely unanswered.", + }, +) + +accuracy_score = Score( + name="Accuracy", + description="Evaluation of how factually correct the AI assistant's response is relative to the product information.", + options={ + "Accurate": "The information provided aligns perfectly with the product specifications without introducing any misleading or incorrect details.", + "PartiallyAccurate": "While some information is correctly stated, the response contains minor factual errors or potentially misleading statements about the product.", + "Inaccurate": "The response presents significantly wrong information about the product, with claims that contradict the actual product details.", + }, +) + +answer_quality_scores = [completeness_score, accuracy_score] + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("--model-alias", type=str, default="openai-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder = build_config(model_alias=args.model_alias) + results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + + results.load_analysis().to_report() diff --git a/fern/components/Authors.tsx b/fern/components/Authors.tsx new file mode 100644 index 000000000..0772dbdde --- /dev/null +++ b/fern/components/Authors.tsx @@ -0,0 +1,51 @@ +/** + * Authors - Renders author byline with avatars for dev notes. + * + * Uses authors data from components/devnotes/authors-data.ts (synced with .authors.yml). + * NOTE: Fern's custom component pipeline uses the automatic JSX runtime. 
+ * + * Usage in MDX (authors from frontmatter): + * --- + * authors: + * - dcorneil + * - etramel + * --- + * + * import { Authors } from "@/components/Authors"; + * + */ + +import { authors } from "./devnotes/authors-data"; + +export interface AuthorsProps { + /** Author IDs from .authors.yml (e.g. dcorneil, etramel, kthadaka, nvidia). From frontmatter: ids={authors} */ + ids?: string[]; +} + +export const Authors = ({ ids }: AuthorsProps) => { + const validAuthors = (ids ?? []) + .map((id) => authors[id]) + .filter(Boolean); + + if (validAuthors.length === 0) return null; + + return ( +
+ {validAuthors.map((author, i) => ( +
+ +
+ {author.name} + {author.description} +
+
+ ))} +
+ ); +}; diff --git a/fern/components/CustomCard.tsx b/fern/components/CustomCard.tsx new file mode 100644 index 000000000..16fc0e2c2 --- /dev/null +++ b/fern/components/CustomCard.tsx @@ -0,0 +1,10 @@ +export const CustomCard = ({ title, text, link, sparkle = false }) => { + return ( + +

+ {title} {sparkle && "✨"} +

+

{text}

+
+ ); +}; diff --git a/fern/components/CustomFooter.tsx b/fern/components/CustomFooter.tsx new file mode 100644 index 000000000..fab392c40 --- /dev/null +++ b/fern/components/CustomFooter.tsx @@ -0,0 +1,91 @@ +/** + * Custom footer for NVIDIA docs (Fern native header/footer). + * Markup and class names match the original custom-app footer 1:1 so that + * fern/main.css (footer + Built with Fern styles) applies correctly: + * dark mode logo, responsive layout, and Built with Fern tooltip. + */ +export default function CustomFooter() { + const currentYear = new Date().getFullYear(); + const logoUrl = + "https://fern-image-hosting.s3.us-east-1.amazonaws.com/nvidia/NVIDIA_Logo_0.svg"; + + return ( + + ); +} diff --git a/fern/components/ExpandableCode.tsx b/fern/components/ExpandableCode.tsx new file mode 100644 index 000000000..889073aa7 --- /dev/null +++ b/fern/components/ExpandableCode.tsx @@ -0,0 +1,72 @@ +/** + * ExpandableCode - Collapsible code block with summary and copy button. + * + * Used for "Full source" code snippets in dev notes. + * NOTE: Fern's custom component pipeline uses the automatic JSX runtime. + * Do NOT import React -- the `react` module is not resolvable in Fern's build. + * + * Usage in MDX: + * import { ExpandableCode } from "@/components/ExpandableCode"; + * + * + */ + +export interface ExpandableCodeProps { + summary: string; + code: string; + language?: string; + defaultOpen?: boolean; +} + +function CopyButton({ text }: { text: string }) { + return ( + + ); +} + +export const ExpandableCode = ({ + summary, + code, + language = "python", + defaultOpen = false, +}: ExpandableCodeProps) => { + const trimmed = code.trim(); + return ( +
+ + {summary} + {language} + +
+
+ +
+
+          {trimmed}
+        
+
+
+ ); +}; diff --git a/fern/components/MetricsTable.tsx b/fern/components/MetricsTable.tsx new file mode 100644 index 000000000..37911dc1a --- /dev/null +++ b/fern/components/MetricsTable.tsx @@ -0,0 +1,101 @@ +/** + * MetricsTable - Styled comparison table for benchmark results. + * + * Optional: highlights best values per column (bold). + * NOTE: Fern's custom component pipeline uses the automatic JSX runtime. + * Do NOT import React -- the `react` module is not resolvable in Fern's build. + * + * Usage in MDX: + * import { MetricsTable } from "@/components/MetricsTable"; + * + * + */ + +export interface MetricsTableProps { + headers: string[]; + rows: (string | number)[][]; + /** Column indices where lower is better (for highlighting) */ + lowerIsBetter?: number[]; + /** Column indices where higher is better (default for non-lowerIsBetter) */ + higherIsBetter?: number[]; +} + +function findBestIndices( + rows: (string | number)[][], + colIndex: number, + lowerIsBetter: boolean +): Set { + const values = rows.map((r) => { + const v = r[colIndex]; + if (typeof v === "number") return v; + const parsed = parseFloat(String(v)); + return isNaN(parsed) ? (lowerIsBetter ? Infinity : -Infinity) : parsed; + }); + const best = lowerIsBetter ? 
Math.min(...values) : Math.max(...values); + const bestIndices = new Set(); + values.forEach((v, i) => { + if (v === best) bestIndices.add(i); + }); + return bestIndices; +} + +export const MetricsTable = ({ + headers, + rows, + lowerIsBetter = [], + higherIsBetter = [], +}: MetricsTableProps) => { + const lowerSet = new Set(lowerIsBetter); + const bestByCol: Record> = {}; + + for (let c = 0; c < headers.length; c++) { + if (lowerSet.has(c)) { + bestByCol[c] = findBestIndices(rows, c, true); + } else if (higherIsBetter.includes(c)) { + bestByCol[c] = findBestIndices(rows, c, false); + } else { + const numLike = rows.every((r) => { + const v = r[c]; + return typeof v === "number" || !isNaN(parseFloat(String(v))); + }); + if (numLike) { + bestByCol[c] = findBestIndices(rows, c, false); + } + } + } + + return ( +
+ + + + {headers.map((h, i) => ( + + ))} + + + + {rows.map((row, rowIdx) => ( + + {row.map((cell, colIdx) => { + const isBest = bestByCol[colIdx]?.has(rowIdx); + return ( + + ); + })} + + ))} + +
{h}
+ {cell} +
+
+ ); +}; diff --git a/fern/components/NotebookViewer.tsx b/fern/components/NotebookViewer.tsx new file mode 100644 index 000000000..b6c9c484a --- /dev/null +++ b/fern/components/NotebookViewer.tsx @@ -0,0 +1,444 @@ +import type { ReactNode } from "react"; + +/** + * NotebookViewer - Renders Jupyter notebook content in Fern docs. + * + * Uses Fern's code block structure (fern-code, fern-code-block, etc.) so input + * and output cells match the default Fern code block styling. + * + * Accepts notebook cells (markdown + code) and optionally a Colab URL. + * Designed to work with Jupytext-generated notebooks from docs/notebook_source/*.py. + * + * NOTE: Fern's custom component pipeline uses the automatic JSX runtime. + * Only type-only imports from "react" are used (erased at compile time). + * + * Usage in MDX: + * import { NotebookViewer } from "@/components/NotebookViewer"; + * import notebook from "@/components/notebooks/1-the-basics"; + * + * + */ + +export interface CellOutput { + type: "text" | "image"; + data: string; + format?: "plain" | "html"; +} + +export interface NotebookCell { + type: "markdown" | "code"; + source: string; + /** Pre-rendered syntax-highlighted HTML (from Pygments). When present, used instead of escaped source. */ + source_html?: string; + language?: string; + outputs?: CellOutput[]; +} + +export interface NotebookData { + cells: NotebookCell[]; +} + +export interface NotebookViewerProps { + /** Notebook data with cells array. If import fails, this may be undefined. */ + notebook?: NotebookData | null; + /** Optional Colab URL for "Run in Colab" badge */ + colabUrl?: string; + /** Show code cell outputs (default: true) */ + showOutputs?: boolean; +} + +function NotebookViewerError({ message, detail }: { message: string; detail?: string }) { + return ( +
+ NotebookViewer error: {message} + {detail && ( +
+          {detail}
+        
+ )} +
+  );
+}
+
+function escapeHtml(text: string): string {
+  if (typeof text !== "string") return "";
+  return text
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;");
+}
+
+function isSafeUrl(url: string): boolean {
+  const trimmed = url.trim();
+  return (
+    trimmed.startsWith("http://") ||
+    trimmed.startsWith("https://") ||
+    trimmed.startsWith("mailto:") ||
+    trimmed.startsWith("#") ||
+    trimmed.startsWith("/")
+  );
+}
+
+const UL_CLASS =
+  "[&>li]:relative [&>li]:before:text-(color:--grayscale-a10) mb-3 list-none pl-3 [&>li]:pl-3 [&>li]:before:absolute [&>li]:before:ml-[-22px] [&>li]:before:mt-[-1px] [&>li]:before:content-['⦁'] [&>li]:before:self-center";
+const OL_CLASS = "mb-3 list-outside list-decimal [&_ol]:!list-[lower-roman]";
+
+function renderMarkdown(markdown: string): string {
+  if (typeof markdown !== "string") return "";
+  let html = markdown
+    .replace(/<br\s*\/?>/gi, "\u0000BR\u0000")
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/\u0000BR\u0000/g, "<br />")
+    .replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_, text, url) => {
+      if (!isSafeUrl(url)) return escapeHtml(`[${text}](${url})`);
+      const isInternal = url.startsWith("/") || url.startsWith("#");
+      const attrs = isInternal
+        ? `href="${escapeHtml(url)}" class="fern-mdx-link"`
+        : `href="${escapeHtml(url)}" target="_blank" rel="noopener noreferrer" class="fern-mdx-link"`;
+      // NOTE(review): the external-link icon SVG string was lost in this copy — restore from the original component.
+      const icon =
+        isInternal
+          ? ""
+          : '';
+      return `<a ${attrs}>${text}${icon}</a>`;
+    })
+    .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
+    .replace(/\*(.*?)\*/g, "<em>$1</em>")
+    .replace(/`([^`]+)`/g, "<code>$1</code>");
+  html = html
+    .split("\n")
+    .map((line) => {
+      if (/^#### (.*)$/.test(line)) return `

<h4>${line.slice(5)}</h4>`;
+      if (/^### (.*)$/.test(line)) return `<h3>${line.slice(4)}</h3>`;
+      if (/^## (.*)$/.test(line)) return `<h2>${line.slice(3)}</h2>`;
+      if (/^# (.*)$/.test(line)) return `<h1>${line.slice(2)}</h1>`;
+      if (/^- (.*)$/.test(line)) return `<li data-ul>${line.slice(2)}</li>`;
+      if (/^\d+\. (.*)$/.test(line)) return `<li data-ol>${line.replace(/^\d+\. /, "")}</li>`;
+      if (line.trim() === "") return "";
+      return `

<p>${line}</p>`;
+    })
+    .join("\n");
+  html = html.replace(
+    /(<li data-ol>.*?<\/li>\s*)+/gs,
+    (m) => `<ol class="${OL_CLASS}">${m.replace(/ data-ol/g, "").trim()}</ol>`
+  );
+  html = html.replace(
+    /(<li data-ul>.*?<\/li>\s*)+/gs,
+    (m) => `<ul class="${UL_CLASS}">${m.replace(/ data-ul/g, "").trim()}</ul>
    ` + ); + return html; +} + +function handleCopy(content: string, button: HTMLButtonElement) { + navigator.clipboard.writeText(content).catch(() => {}); + const originalHtml = button.innerHTML; + const originalLabel = button.getAttribute("aria-label") ?? "Copy code"; + button.innerHTML = "Copied!"; + button.setAttribute("aria-label", "Copied to clipboard"); + setTimeout(() => { + button.innerHTML = originalHtml; + button.setAttribute("aria-label", originalLabel); + }, 1500); +} + +const FLAG_ICON = ( + + + + +); + +const SCROLL_AREA_STYLE = `[data-radix-scroll-area-viewport]{scrollbar-width:none;-ms-overflow-style:none;-webkit-overflow-scrolling:touch;}[data-radix-scroll-area-viewport]::-webkit-scrollbar{display:none}`; + +const BUTTON_BASE_CLASS = + "focus-visible:ring-(color:--accent) rounded-2 inline-flex items-center justify-center gap-2 whitespace-nowrap text-sm font-medium transition-colors hover:transition-none focus-visible:outline-none focus-visible:ring-1 disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 text-(color:--grayscale-a11) hover:bg-(color:--accent-a3) hover:text-(color:--accent-11) pointer-coarse:size-9 size-7"; + +/** Fern code block structure – matches Fern docs (header with language + buttons, pre with scroll area). */ +function FernCodeBlock({ + title, + children, + className = "", + asPre = true, + copyContent, + showLineNumbers = false, + codeHtml, +}: { + title: string; + children: ReactNode; + className?: string; + /** Use div instead of pre for content (needed when children include block elements like img/div). */ + asPre?: boolean; + /** Raw text to copy when copy button is clicked. When provided, shows a copy button. */ + copyContent?: string; + /** Show line numbers in a table layout (matches Fern's code block structure). */ + showLineNumbers?: boolean; + /** Pre-rendered HTML for each line when showLineNumbers is true. Lines are split by newline. 
*/ + codeHtml?: string; +}) { + const headerLabel = title === "Output" ? "Output" : title.charAt(0).toUpperCase() + title.slice(1); + const wrapperClasses = + "fern-code fern-code-block bg-card-background border-card-border rounded-3 shadow-card-grayscale relative mb-6 mt-4 flex w-full min-w-0 max-w-full flex-col border first:mt-0"; + const preStyle = { + backgroundColor: "rgb(255, 255, 255)", + ["--shiki-dark-bg" as string]: "#212121", + color: "rgb(36, 41, 46)", + ["--shiki-dark" as string]: "#EEFFFF", + }; + + const scrollAreaContent = () => { + if (codeHtml == null) return null; + const lines = codeHtml.split("\n"); + return ( +
    +