diff --git a/.github/codex/prompts/analyze-release.md b/.github/codex/prompts/analyze-release.md index aab8f11..bbb811f 100644 --- a/.github/codex/prompts/analyze-release.md +++ b/.github/codex/prompts/analyze-release.md @@ -186,25 +186,25 @@ Return a JSON object matching the output schema with: } }, { - "name": "vectoriadb", + "name": "enclave-vm", "bump": "patch", - "newVersion": "1.0.1", - "reason": "Fixed memory leak in vector storage", + "newVersion": "2.0.1", + "reason": "Fixed memory leak in sandbox cleanup", "changelog": { "added": [], "changed": [], "deprecated": [], "removed": [], - "fixed": ["Fixed memory leak when deleting vectors from namespace"], + "fixed": ["Fixed memory leak when disposing sandbox instances"], "security": [] } } ], "globalChangelog": { - "summary": "Minor updates to ast-guard and vectoriadb", + "summary": "Minor updates to ast-guard and enclave-vm", "projects": [ { "name": "ast-guard", "version": "1.1.0", "summary": "Added sanitizeHtml function" }, - { "name": "vectoriadb", "version": "1.0.1", "summary": "Fixed memory leak" } + { "name": "enclave-vm", "version": "2.0.1", "summary": "Fixed memory leak" } ] }, "docs": { diff --git a/.github/workflows/cherry-pick-prompt.yml b/.github/workflows/cherry-pick-prompt.yml new file mode 100644 index 0000000..5ff2796 --- /dev/null +++ b/.github/workflows/cherry-pick-prompt.yml @@ -0,0 +1,288 @@ +name: Cherry-pick to main + +on: + pull_request: + types: [closed] + +permissions: + contents: write + pull-requests: write + issues: write + +concurrency: + group: enclave-cherry-pick-${{ github.event.pull_request.number }} + cancel-in-progress: false + +jobs: + cherry-pick: + # Only run when: + # - PR was merged (not just closed) + # - PR is not from a fork (forked PRs have read-only GITHUB_TOKEN) + # - Target branch is a release branch (not main) + if: > + github.event.pull_request.merged == true && + github.event.pull_request.head.repo.fork == false && + github.event.pull_request.base.ref != github.event.repository.default_branch && + (startsWith(github.event.pull_request.base.ref, 'release/') || + startsWith(github.event.pull_request.base.ref, 'next/')) + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure git user + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Prepare cherry-pick branch + id: prepare + shell: bash + env: + ORIGINAL_BRANCH: ${{ github.event.pull_request.head.ref }} + RELEASE_BRANCH: ${{ github.event.pull_request.base.ref }} + MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PR_TITLE: ${{ github.event.pull_request.title }} + run: | + set -euo pipefail + + # Create cherry-pick branch name + # Sanitize branch name (remove invalid characters, truncate if needed) + CHERRY_BRANCH="${ORIGINAL_BRANCH}-cherry-pick-main" + CHERRY_BRANCH=$(echo "$CHERRY_BRANCH" | sed 's/[^a-zA-Z0-9._/-]/-/g' | head -c 100) + + # Validate branch name is valid for git + if ! 
git check-ref-format --branch "$CHERRY_BRANCH" >/dev/null 2>&1; then + echo "::error::Invalid branch name after sanitization: $CHERRY_BRANCH" + exit 1 + fi + + echo "cherry_branch=$CHERRY_BRANCH" >> "$GITHUB_OUTPUT" + echo "Cherry-pick branch: $CHERRY_BRANCH" + + # Get default branch + DEFAULT_BRANCH="${{ github.event.repository.default_branch }}" + + # Checkout default branch (main) + git fetch origin "$DEFAULT_BRANCH" + git checkout "$DEFAULT_BRANCH" + git pull origin "$DEFAULT_BRANCH" + + # Create the cherry-pick branch (delete if already exists) + if git show-ref --verify --quiet "refs/heads/$CHERRY_BRANCH"; then + echo "Branch $CHERRY_BRANCH already exists locally, deleting..." + git branch -D "$CHERRY_BRANCH" + fi + if git ls-remote --heads origin "$CHERRY_BRANCH" | grep -q .; then + echo "Branch $CHERRY_BRANCH already exists on remote, deleting..." + git push origin --delete "$CHERRY_BRANCH" || true + fi + git checkout -b "$CHERRY_BRANCH" + + # Attempt cherry-pick + echo "Attempting cherry-pick of $MERGE_SHA..." + + # Detect if merge commit (has multiple parents) + PARENT_COUNT=$(git rev-list --parents -n1 "$MERGE_SHA" | wc -w) + PARENT_COUNT=$((PARENT_COUNT - 1)) # Subtract the commit itself + + CHERRY_PICK_ARGS="--no-commit" + if [ "$PARENT_COUNT" -gt 1 ]; then + echo "Detected merge commit with $PARENT_COUNT parents, using -m 1" + CHERRY_PICK_ARGS="-m 1 --no-commit" + fi + + if git cherry-pick $CHERRY_PICK_ARGS "$MERGE_SHA"; then + echo "conflict=false" >> "$GITHUB_OUTPUT" + + # Create commit with proper attribution + git commit -m "$(cat < + EOF + )" + echo "Cherry-pick successful" + else + echo "conflict=true" >> "$GITHUB_OUTPUT" + git cherry-pick --abort || true + echo "Cherry-pick failed due to conflicts" + fi + + - name: Ensure labels exist + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh label create "cherry-pick" --color "0E8A16" --description "Cherry-picked changes" 2>/dev/null || true + gh label create "auto-cherry-pick" --color "1D76DB" --description "Auto cherry-pick" 2>/dev/null || true + gh label create "conflict" --color "D93F0B" --description "Has conflicts" 2>/dev/null || true + gh label create "needs-attention" --color "FBCA04" --description "Needs attention" 2>/dev/null || true + + - name: Push branch and create PR + if: steps.prepare.outputs.conflict == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CHERRY_BRANCH: ${{ steps.prepare.outputs.cherry_branch }} + RELEASE_BRANCH: ${{ github.event.pull_request.base.ref }} + ORIGINAL_PR: ${{ github.event.pull_request.number }} + PR_TITLE: ${{ github.event.pull_request.title }} + PR_AUTHOR: ${{ github.event.pull_request.user.login }} + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + run: | + set -euo pipefail + + # Push the cherry-pick branch + git push origin "$CHERRY_BRANCH" + + # Create the PR + PR_URL=$(gh pr create \ + --base "$DEFAULT_BRANCH" \ + --head "$CHERRY_BRANCH" \ + --title "Cherry-pick: $PR_TITLE" \ + --label "cherry-pick" \ + --label "auto-cherry-pick" \ + --body "$(cat <> "$GITHUB_OUTPUT" + + - name: Comment on original PR (success) + if: steps.prepare.outputs.conflict == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ORIGINAL_PR: ${{ github.event.pull_request.number }} + CHERRY_BRANCH: ${{ steps.prepare.outputs.cherry_branch }} + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + run: | + gh pr comment "$ORIGINAL_PR" --body "$(cat <> "$GITHUB_OUTPUT" + + - name: Attempt to assign issue + if: steps.prepare.outputs.conflict == 'true' && 
steps.create_issue.outputs.issue_url != '' + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_AUTHOR: ${{ github.event.pull_request.user.login }} + ISSUE_URL: ${{ steps.create_issue.outputs.issue_url }} + run: | + # Extract issue number from URL + ISSUE_NUMBER=$(echo "$ISSUE_URL" | grep -oE '[0-9]+$') + gh issue edit "$ISSUE_NUMBER" --add-assignee "$PR_AUTHOR" || echo "Could not assign to $PR_AUTHOR, skipping assignment" + + - name: Comment on original PR (conflict) + if: steps.prepare.outputs.conflict == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ORIGINAL_PR: ${{ github.event.pull_request.number }} + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + run: | + gh pr comment "$ORIGINAL_PR" --body "$(cat <&2 @@ -45,7 +58,7 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - - name: Find affected projects with per-package version detection + - name: Find affected projects with unified versioning for core libs id: affected shell: bash run: | @@ -53,6 +66,10 @@ jobs: FIRST_COMMIT=$(git rev-list --max-parents=0 HEAD) + # Core libs use unified versioning - if any has changes, all are released together + CORE_LIBS="${{ env.CORE_LIBS }}" + echo "Core libs (unified versioning): $CORE_LIBS" + # Get all publishable libs ALL_LIBS=$(node -e " const { execSync } = require('child_process'); @@ -68,24 +85,26 @@ jobs: if [ -z "$ALL_LIBS" ]; then echo "projects=" >> "$GITHUB_OUTPUT" echo "project_versions=" >> "$GITHUB_OUTPUT" + echo "unified_version=" >> "$GITHUB_OUTPUT" echo "No publishable libraries found" exit 0 fi echo "All publishable libs: $ALL_LIBS" - # For each lib, find its last release tag and check if affected + # Track versions and changes for all libs + CORE_HAS_CHANGES=false + CORE_HIGHEST_VERSION="0.0.0" AFFECTED_LIBS="" PROJECT_VERSIONS="" IFS=',' read -ra LIBS <<< "$ALL_LIBS" for lib in "${LIBS[@]}"; do - # Find the last release tag for this specific package (e.g., enclave-vm@2.0.0) + # Find the last release tag for this specific package LAST_TAG=$(git tag --list "${lib}@*" --sort=-version:refname | head -n1 || echo "") if [ -n "$LAST_TAG" ]; then BASE_REF="$LAST_TAG" - # Extract version from tag (e.g., enclave-vm@2.0.0 -> 2.0.0) LAST_VERSION="${LAST_TAG#*@}" echo " $lib: last release tag=$LAST_TAG (v$LAST_VERSION)" else @@ -94,30 +113,73 @@ jobs: echo " $lib: no release tag found, using first commit (first release)" fi + # Check if this is a core lib + IS_CORE=false + IFS=',' read -ra CORE_ARR <<< "$CORE_LIBS" + for core in "${CORE_ARR[@]}"; do + if [ "$lib" = "$core" ]; then + IS_CORE=true + # Track highest version among core libs for unified versioning + if [ "$(printf '%s\n%s' "$LAST_VERSION" "$CORE_HIGHEST_VERSION" | sort -V | tail -n1)" = "$LAST_VERSION" ]; then + CORE_HIGHEST_VERSION="$LAST_VERSION" + fi + break + fi + done + # Check if this lib has changes since its last release CHANGES=$(git diff --name-only "$BASE_REF"...HEAD -- "libs/$lib/" 2>/dev/null | head -1 || echo "") if [ -n "$CHANGES" ]; then echo " $lib: has changes since $BASE_REF" - if [ -n "$AFFECTED_LIBS" ]; then - AFFECTED_LIBS="${AFFECTED_LIBS},$lib" - PROJECT_VERSIONS="${PROJECT_VERSIONS},$lib=$LAST_VERSION" + if [ "$IS_CORE" = true ]; then + CORE_HAS_CHANGES=true else - AFFECTED_LIBS="$lib" - PROJECT_VERSIONS="$lib=$LAST_VERSION" + # Non-core libs are tracked independently + if [ -n "$AFFECTED_LIBS" ]; then + AFFECTED_LIBS="${AFFECTED_LIBS},$lib" + 
PROJECT_VERSIONS="${PROJECT_VERSIONS},$lib=$LAST_VERSION" + else + AFFECTED_LIBS="$lib" + PROJECT_VERSIONS="$lib=$LAST_VERSION" + fi fi else echo " $lib: no changes since $BASE_REF" fi done + # If any core lib has changes, include ALL core libs with unified version + if [ "$CORE_HAS_CHANGES" = true ]; then + echo "" + echo "Core libs have changes - including all core libs with unified versioning" + echo "Unified version base: $CORE_HIGHEST_VERSION" + + # Add all core libs to affected list + IFS=',' read -ra CORE_ARR <<< "$CORE_LIBS" + for core in "${CORE_ARR[@]}"; do + if [ -n "$AFFECTED_LIBS" ]; then + AFFECTED_LIBS="${AFFECTED_LIBS},$core" + PROJECT_VERSIONS="${PROJECT_VERSIONS},$core=$CORE_HIGHEST_VERSION" + else + AFFECTED_LIBS="$core" + PROJECT_VERSIONS="$core=$CORE_HIGHEST_VERSION" + fi + done + fi + echo "projects=$AFFECTED_LIBS" >> "$GITHUB_OUTPUT" echo "project_versions=$PROJECT_VERSIONS" >> "$GITHUB_OUTPUT" + echo "unified_version=$CORE_HIGHEST_VERSION" >> "$GITHUB_OUTPUT" + echo "core_has_changes=$CORE_HAS_CHANGES" >> "$GITHUB_OUTPUT" if [ -n "$AFFECTED_LIBS" ]; then echo "" - echo "Affected publishable libs: $AFFECTED_LIBS" - echo "Last released versions: $PROJECT_VERSIONS" + echo "Projects to release: $AFFECTED_LIBS" + echo "Version info: $PROJECT_VERSIONS" + if [ "$CORE_HAS_CHANGES" = true ]; then + echo "Core libs unified version: $CORE_HIGHEST_VERSION" + fi else echo "No affected publishable libraries" fi @@ -130,8 +192,10 @@ jobs: # ======================================== # MCP Integration: Configure Codex home with MCP servers + # Only needed for auto mode (Codex) # ======================================== - name: Prepare Codex home with MCP config + if: github.event.inputs.bump_type == 'auto' || github.event.inputs.bump_type == '' id: mcp shell: bash run: | @@ -161,6 +225,7 @@ jobs: RUNNER_TEMP: ${{ runner.temp }} - name: Prepare diff context for Codex + if: github.event.inputs.bump_type == 'auto' || github.event.inputs.bump_type == '' id: ctx shell: bash run: | @@ -223,12 +288,106 @@ jobs: # ISO date date -u +"%Y-%m-%d" > .github/codex/prompts/date.txt + # ======================================== + # Manual version bump (when bump_type != 'auto') + # Generates mock Codex output with unified versioning for core libs + # ======================================== + - name: Generate manual version bumps + if: steps.affected.outputs.projects != '' && github.event.inputs.bump_type != 'auto' && github.event.inputs.bump_type != '' + id: manual_bump + shell: bash + run: | + set -euo pipefail + + BUMP_TYPE="${{ github.event.inputs.bump_type }}" + PROJECTS="${{ steps.affected.outputs.projects }}" + UNIFIED_VERSION="${{ steps.affected.outputs.unified_version }}" + CORE_LIBS="${{ env.CORE_LIBS }}" + + echo "Manual bump type: $BUMP_TYPE" + echo "Projects to release: $PROJECTS" + echo "Core libs: $CORE_LIBS" + echo "Current unified version: $UNIFIED_VERSION" + + # Parse version components from unified version + IFS='.' 
read -r MAJOR MINOR PATCH <<< "$UNIFIED_VERSION" + + # Calculate new version based on bump type + case "$BUMP_TYPE" in + major) + NEW_VERSION="$((MAJOR + 1)).0.0" + ;; + minor) + NEW_VERSION="${MAJOR}.$((MINOR + 1)).0" + ;; + patch) + NEW_VERSION="${MAJOR}.${MINOR}.$((PATCH + 1))" + ;; + esac + + echo "New unified version: $NEW_VERSION" + echo "new_version=$NEW_VERSION" >> "$GITHUB_OUTPUT" + + # Generate mock Codex output JSON for nx-release.mjs + mkdir -p .codex-release + + # Build projects array - core libs get unified version + PROJECTS_JSON="[" + FIRST=true + IFS=',' read -ra LIBS <<< "$PROJECTS" + for lib in "${LIBS[@]}"; do + # Check if this is a core lib + IS_CORE=false + IFS=',' read -ra CORE_ARR <<< "$CORE_LIBS" + for core in "${CORE_ARR[@]}"; do + if [ "$lib" = "$core" ]; then + IS_CORE=true + break + fi + done + + # Core libs use unified version, others use their own + if [ "$IS_CORE" = true ]; then + LIB_VERSION="$NEW_VERSION" + else + LIB_VERSION="$NEW_VERSION" # For simplicity, use same version for all in manual mode + fi + + if [ "$FIRST" = true ]; then + FIRST=false + else + PROJECTS_JSON+="," + fi + PROJECTS_JSON+="{\"name\":\"$lib\",\"bump\":\"$BUMP_TYPE\",\"newVersion\":\"$LIB_VERSION\",\"reason\":\"Manual $BUMP_TYPE bump\",\"changelog\":{\"added\":[],\"changed\":[],\"deprecated\":[],\"removed\":[],\"fixed\":[],\"security\":[]}}" + done + PROJECTS_JSON+="]" + + # Write mock Codex output + cat > "${{ env.CODEX_OUTPUT }}" << EOF + { + "projects": $PROJECTS_JSON, + "globalChangelog": { + "summary": "Manual $BUMP_TYPE release", + "projects": [] + }, + "docs": { + "updated": false, + "files": [], + "summary": "" + } + } + EOF + + echo "Generated manual Codex output:" + cat "${{ env.CODEX_OUTPUT }}" + # ======================================== # Single Codex call for version analysis + docs update # (Cannot run sequential Codex actions due to sudo restrictions) + # Only runs in 'auto' mode # ======================================== - name: Run Codex for release analysis - if: steps.affected.outputs.projects != '' + if: steps.affected.outputs.projects != '' && (github.event.inputs.bump_type == 'auto' || github.event.inputs.bump_type == '') id: codex uses: openai/codex-action@v1 with: @@ -409,7 +568,16 @@ jobs: echo "## Release Summary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "**Branch:** ${{ steps.branch.outputs.branch_name }}" >> $GITHUB_STEP_SUMMARY - echo "**Max Version:** ${{ steps.versions.outputs.max_version }}" >> $GITHUB_STEP_SUMMARY + echo "**Version:** ${{ steps.versions.outputs.max_version }}" >> $GITHUB_STEP_SUMMARY echo "**Projects:** ${{ steps.versions.outputs.bumped_projects }}" >> $GITHUB_STEP_SUMMARY - echo "**Last Released Versions:** ${{ steps.affected.outputs.project_versions }}" >> $GITHUB_STEP_SUMMARY - echo "**Analysis:** Codex AI (gpt-5.1-codex)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Versioning Strategy" >> $GITHUB_STEP_SUMMARY + echo "**Core libs (unified):** ${{ env.CORE_LIBS }}" >> $GITHUB_STEP_SUMMARY + echo "**Previous unified version:** ${{ steps.affected.outputs.unified_version }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + BUMP_TYPE="${{ github.event.inputs.bump_type }}" + if [ "$BUMP_TYPE" = "auto" ] || [ -z "$BUMP_TYPE" ]; then + echo "**Analysis:** Codex AI (gpt-5.1-codex)" >> $GITHUB_STEP_SUMMARY + else + echo "**Analysis:** Manual ($BUMP_TYPE bump)" >> $GITHUB_STEP_SUMMARY + fi diff --git a/.github/workflows/publish-on-next-close.yml 
b/.github/workflows/publish-on-next-close.yml index 5a32072..ea32b1b 100644 --- a/.github/workflows/publish-on-next-close.yml +++ b/.github/workflows/publish-on-next-close.yml @@ -185,6 +185,8 @@ jobs: - name: Publish to npm via Nx Release if: steps.to_publish.outputs.projects != '' shell: bash + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} run: | set -euo pipefail echo "Publishing selected projects via Nx Release..." diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 4fe43c5..23ebd8f 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -24,7 +24,7 @@ jobs: fetch-depth: 0 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version-file: ".nvmrc" cache: "yarn" @@ -77,7 +77,7 @@ jobs: fetch-depth: 0 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version-file: ".nvmrc" cache: "yarn" @@ -127,7 +127,7 @@ jobs: fetch-depth: 0 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version-file: ".nvmrc" cache: "yarn" diff --git a/.github/workflows/trigger-docs-update.yml b/.github/workflows/trigger-docs-update.yml new file mode 100644 index 0000000..34df6fe --- /dev/null +++ b/.github/workflows/trigger-docs-update.yml @@ -0,0 +1,184 @@ +# ============================================================================= +# Trigger Docs Update Workflow +# ============================================================================= +# +# Sends a repository_dispatch event to the agentfront-docs repo when: +# 1. A GitHub Release is published (for stable releases) +# 2. A PR is merged to a release/* branch (for docs changes before release) +# +# SETUP REQUIRED: +# Create a repository secret named DOCS_SYNC_TOKEN: +# - Go to your repo Settings > Secrets and variables > Actions +# - Create a new secret named DOCS_SYNC_TOKEN +# - Value: A GitHub PAT with 'repo' scope that has access to agentfront-docs +# +# ============================================================================= + +name: Trigger Docs Update + +on: + release: + types: [published] + pull_request: + types: [closed] + +permissions: + contents: read + +env: + REPO_NAME: enclave + +jobs: + trigger-on-release: + if: github.event_name == 'release' + runs-on: ubuntu-latest + steps: + - name: Trigger docs sync + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.DOCS_SYNC_TOKEN }} + script: | + const repoName = process.env.REPO_NAME; + const tag = context.payload.release.tag_name; + const sha = context.sha; + + // Extract version minor from tag (e.g., "v2.1.0" -> "2.1") + const versionMatch = tag.match(/^v?(\d+)\.(\d+)/); + const versionMinor = versionMatch ? 
`${versionMatch[1]}.${versionMatch[2]}` : null; + + console.log(`Triggering docs sync for ${repoName}`); + console.log(` Tag: ${tag}`); + console.log(` SHA: ${sha}`); + console.log(` Version minor: ${versionMinor}`); + + try { + await github.rest.repos.createDispatchEvent({ + owner: 'agentfront', + repo: 'docs', + event_type: 'sync-docs', + client_payload: { + repo: repoName, + sha: sha, + tag: tag, + version_minor: versionMinor + } + }); + console.log(`Successfully triggered docs sync for ${tag}`); + } catch (error) { + console.error(`Failed to trigger docs sync: ${error.message}`); + if (error.status === 404) { + console.error('Check that DOCS_SYNC_TOKEN has access to agentfront/docs'); + } else if (error.status === 401) { + console.error('Check that DOCS_SYNC_TOKEN secret is set correctly'); + } + throw error; + } + + - name: Summary + run: | + echo "## Docs Sync Triggered (Release)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Repository:** ${{ env.REPO_NAME }}" >> $GITHUB_STEP_SUMMARY + echo "- **Tag:** ${{ github.event.release.tag_name }}" >> $GITHUB_STEP_SUMMARY + echo "- **SHA:** ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "The [agentfront-docs](https://github.com/agentfront/docs) repository will sync the documentation shortly." >> $GITHUB_STEP_SUMMARY + + trigger-on-pr-merge: + if: > + github.event_name == 'pull_request' && + github.event.pull_request.merged == true && + startsWith(github.event.pull_request.base.ref, 'release/') + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine diff base + id: diff_base + shell: bash + run: | + set -euo pipefail + + BRANCH="${{ github.event.pull_request.base.ref }}" + + # Check for .release-docs-base marker file + if [ -f ".release-docs-base" ]; then + DIFF_BASE=$(cat .release-docs-base) + echo "Using diff base from .release-docs-base: $DIFF_BASE" + else + # Fallback to branch creation point + DIFF_BASE=$(git merge-base origin/main "origin/$BRANCH" 2>/dev/null || echo "") + if [ -z "$DIFF_BASE" ]; then + DIFF_BASE="HEAD~1" + fi + echo "Using fallback diff base: $DIFF_BASE" + fi + + echo "diff_base=$DIFF_BASE" >> "$GITHUB_OUTPUT" + + # Extract version minor from branch (e.g., "release/2.1.x" -> "2.1") + VERSION_MINOR=$(echo "$BRANCH" | sed 's/release\/\([0-9]*\.[0-9]*\).*/\1/') + echo "version_minor=$VERSION_MINOR" >> "$GITHUB_OUTPUT" + echo "Version minor: $VERSION_MINOR" + + - name: Trigger docs sync + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.DOCS_SYNC_TOKEN }} + script: | + const repoName = process.env.REPO_NAME; + const branch = '${{ github.event.pull_request.base.ref }}'; + const sha = context.sha; + const diffBase = '${{ steps.diff_base.outputs.diff_base }}'; + const versionMinor = '${{ steps.diff_base.outputs.version_minor }}'; + const prNumber = context.payload.pull_request.number; + const prTitle = context.payload.pull_request.title; + + console.log(`Triggering docs sync for ${repoName}`); + console.log(` Branch: ${branch}`); + console.log(` SHA: ${sha}`); + console.log(` Diff base: ${diffBase}`); + console.log(` Version minor: ${versionMinor}`); + console.log(` PR: #${prNumber} - ${prTitle}`); + + try { + await github.rest.repos.createDispatchEvent({ + owner: 'agentfront', + repo: 'docs', + event_type: 'sync-docs', + client_payload: { + repo: repoName, + sha: sha, + branch: branch, + diff_base: diffBase, + version_minor: versionMinor, + pr_number: prNumber, + pr_title: 
prTitle + } + }); + console.log(`Successfully triggered docs sync for PR #${prNumber}`); + } catch (error) { + console.error(`Failed to trigger docs sync: ${error.message}`); + if (error.status === 404) { + console.error('Check that DOCS_SYNC_TOKEN has access to agentfront/docs'); + } else if (error.status === 401) { + console.error('Check that DOCS_SYNC_TOKEN secret is set correctly'); + } + // Don't fail the workflow for docs sync issues + console.log('Continuing despite docs sync failure'); + } + + - name: Summary + run: | + echo "## Docs Sync Triggered (PR Merge)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Repository:** ${{ env.REPO_NAME }}" >> $GITHUB_STEP_SUMMARY + echo "- **Branch:** ${{ github.event.pull_request.base.ref }}" >> $GITHUB_STEP_SUMMARY + echo "- **PR:** #${{ github.event.pull_request.number }}" >> $GITHUB_STEP_SUMMARY + echo "- **SHA:** ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY + echo "- **Diff base:** ${{ steps.diff_base.outputs.diff_base }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "The [agentfront-docs](https://github.com/agentfront/docs) repository will sync the documentation shortly." >> $GITHUB_STEP_SUMMARY diff --git a/CHANGELOG.md b/CHANGELOG.md index 9630683..b546096 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,6 @@ For detailed changes to individual packages, see their respective changelogs: - [enclave-vm](libs/enclave-vm/CHANGELOG.md) - [ast-guard](libs/ast-guard/CHANGELOG.md) -- [vectoriadb](libs/vectoriadb/CHANGELOG.md) ## [Unreleased] @@ -35,7 +34,6 @@ Security-focused release aligning enclave sandbox globals with AST guard while h | Package | Version | Highlights | | ---------- | ------- | ---------------------------------------------------------------------------------------------- | | enclave-vm | 2.5.0 | Sandbox now enforces security-level-specific globals and the double-VM bootstrap was hardened. | -| vectoriadb | 2.0.2 | Regex analyzer and Redis namespace sanitization now resist ReDoS inputs. | | ast-guard | 2.2.0 | Introduced security-level-aware AgentScript globals plus safer regex pre-scanning. | ## 2026-01-06 @@ -63,7 +61,6 @@ Transformer-dependent features now load Hugging Face models lazily with optional | Package | Version | Highlights | | ---------- | ------- | ---------------------------------------------------------------- | | enclave-vm | 1.0.2 | LocalLlmScorer lazy-loads transformers, optional peer dependency | -| vectoriadb | 2.0.0 | EmbeddingService dynamic loading, injection hooks, optional peer | ## 2025-12-11 @@ -80,5 +77,4 @@ Initial release of the Enclave monorepo. | Package | Version | Highlights | | ---------- | ------- | -------------------------------------------------- | | ast-guard | 1.0.0 | AST-based JavaScript validator with CVE protection | -| vectoriadb | 1.0.0 | In-memory vector database for semantic search | | enclave-vm | 1.0.0 | Secure AgentScript execution environment | diff --git a/README.md b/README.md index 0d98f6b..0f66209 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ **Secure sandbox runtime for AI agents** [![npm ast-guard](https://img.shields.io/npm/v/ast-guard.svg?label=ast-guard&color=e8a045)](https://www.npmjs.com/package/ast-guard) -[![npm vectoriadb](https://img.shields.io/npm/v/vectoriadb.svg?label=vectoriadb&color=e8a045)](https://www.npmjs.com/package/vectoriadb) [![npm enclave-vm](https://img.shields.io/npm/v/enclave-vm.svg?label=enclave-vm&color=e8a045)](https://www.npmjs.com/package/enclave-vm)
[![npm @enclavejs/broker](https://img.shields.io/npm/v/@enclavejs/broker.svg?label=@enclavejs/broker&color=e8a045)](https://www.npmjs.com/package/@enclavejs/broker) @@ -44,7 +43,6 @@ ```bash npm install enclave-vm # Secure JS sandbox npm install ast-guard # AST security validation -npm install vectoriadb # Vector search ``` ### Streaming Runtime @@ -67,7 +65,6 @@ npm install @enclavejs/react # React hooks & components | [`@enclavejs/types`](./libs/enclavejs-types) | TypeScript types & Zod schemas | | [`@enclavejs/stream`](./libs/enclavejs-stream) | NDJSON streaming, encryption, reconnection | | [`ast-guard`](./libs/ast-guard) | AST-based security validator | -| [`vectoriadb`](./libs/vectoriadb) | Lightweight in-memory vector database | ## Quick Start diff --git a/apps/vectoriadb-demo/project.json b/apps/vectoriadb-demo/project.json deleted file mode 100644 index 5565c53..0000000 --- a/apps/vectoriadb-demo/project.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "name": "vectoriadb-demo", - "$schema": "../../node_modules/nx/schemas/project-schema.json", - "sourceRoot": "apps/vectoriadb-demo/src", - "projectType": "application", - "targets": { - "build": { - "executor": "@nx/js:tsc", - "outputs": ["{options.outputPath}"], - "options": { - "outputPath": "dist/apps/vectoriadb-demo", - "main": "apps/vectoriadb-demo/src/main.ts", - "tsConfig": "apps/vectoriadb-demo/tsconfig.app.json" - } - }, - "serve": { - "executor": "@nx/js:node", - "options": { - "buildTarget": "vectoriadb-demo:build" - } - }, - "test": { - "options": { - "passWithNoTests": true - } - } - }, - "tags": [] -} diff --git a/apps/vectoriadb-demo/src/main.ts b/apps/vectoriadb-demo/src/main.ts deleted file mode 100644 index 48a3459..0000000 --- a/apps/vectoriadb-demo/src/main.ts +++ /dev/null @@ -1,86 +0,0 @@ -/** - * vectoriadb Demo - * - * Demonstrates in-memory vector database with TF-IDF embeddings - */ - -import { TFIDFVectoria } from 'vectoriadb'; - -interface DocMetadata { - id: string; - category: string; -} - -function main() { - console.log('=== vectoriadb Demo ===\n'); - - // Create a TF-IDF based vector database (no external dependencies) - console.log('1. Creating TF-IDF vector database...'); - const db = new TFIDFVectoria(); - console.log(' Database initialized\n'); - - // Sample documents - const documents = [ - { - id: '1', - text: 'JavaScript is a popular programming language for web development', - metadata: { id: '1', category: 'programming' }, - }, - { id: '2', text: 'TypeScript adds static typing to JavaScript', metadata: { id: '2', category: 'programming' } }, - { id: '3', text: 'Machine learning enables computers to learn from data', metadata: { id: '3', category: 'ai' } }, - { id: '4', text: 'Neural networks are a key component of deep learning', metadata: { id: '4', category: 'ai' } }, - { - id: '5', - text: 'Node.js allows running JavaScript on the server', - metadata: { id: '5', category: 'programming' }, - }, - ]; - - // Insert documents - console.log('2. Inserting documents...'); - for (const doc of documents) { - db.addDocument(doc.id, doc.text, doc.metadata); - console.log(` Added: "${doc.text.substring(0, 40)}..."`); - } - - // Reindex after adding documents - console.log(' Reindexing...'); - db.reindex(); - console.log(); - - // Search for similar documents - console.log('3. 
Searching for "JavaScript programming"...'); - const results1 = db.search('JavaScript programming', { topK: 3 }); - console.log(' Top 3 results:'); - for (const result of results1) { - const doc = db.getDocument(result.id); - console.log(` - [${result.score.toFixed(3)}] ${doc?.text.substring(0, 50)}...`); - } - console.log(); - - // Another search - console.log('4. Searching for "artificial intelligence"...'); - const results2 = db.search('artificial intelligence', { topK: 2 }); - console.log(' Top 2 results:'); - for (const result of results2) { - const doc = db.getDocument(result.id); - console.log(` - [${result.score.toFixed(3)}] ${doc?.text.substring(0, 50)}...`); - } - console.log(); - - // Get document count - console.log('5. Database statistics:'); - console.log(` Total documents: ${db.getDocumentCount()}`); - console.log(); - - // Delete a document - console.log('6. Deleting document "3"...'); - db.removeDocument('3'); - db.reindex(); - console.log(` New document count: ${db.getDocumentCount()}`); - console.log(); - - console.log('=== Demo Complete ==='); -} - -main(); diff --git a/apps/vectoriadb-demo/tsconfig.app.json b/apps/vectoriadb-demo/tsconfig.app.json deleted file mode 100644 index 840ba15..0000000 --- a/apps/vectoriadb-demo/tsconfig.app.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "extends": "./tsconfig.json", - "compilerOptions": { - "outDir": "../../dist/out-tsc", - "module": "commonjs", - "types": ["node"] - }, - "exclude": ["**/*.spec.ts", "**/*.test.ts"], - "include": ["src/**/*.ts"] -} diff --git a/apps/vectoriadb-demo/tsconfig.json b/apps/vectoriadb-demo/tsconfig.json deleted file mode 100644 index 3685ac7..0000000 --- a/apps/vectoriadb-demo/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.base.json", - "files": [], - "include": [], - "references": [ - { - "path": "./tsconfig.app.json" - } - ], - "compilerOptions": { - "esModuleInterop": true - } -} diff --git a/docs/api-reference/vectoriadb.mdx b/docs/api-reference/vectoriadb.mdx deleted file mode 100644 index 8331cad..0000000 --- a/docs/api-reference/vectoriadb.mdx +++ /dev/null @@ -1,481 +0,0 @@ ---- -title: 'vectoriadb API' -description: 'Complete API reference for the vectoriadb package' ---- - -Complete API reference for the `vectoriadb` package. - -## Installation - -```bash -npm install vectoriadb -``` - -## VectoriaDB Class - -Main vector database class. - -### Constructor - -```ts -new VectoriaDB(options?: VectoriaDBOptions) -``` - -### VectoriaDBOptions - -```ts -interface VectoriaDBOptions { - // Dimensions - dimensions?: number; - - // Distance metric - metric?: 'cosine' | 'euclidean' | 'dot'; - - // Persistence - persistence?: PersistenceAdapter; - - // Indexing - indexType?: 'flat' | 'hnsw'; - hnswOptions?: HNSWOptions; - - // Embedding - embedder?: Embedder; -} -``` - -| Property | Type | Default | Description | -|----------|------|---------|-------------| -| `dimensions` | `number` | Auto-detect | Vector dimensions | -| `metric` | `string` | `'cosine'` | Distance metric | -| `persistence` | `PersistenceAdapter` | In-memory | Storage adapter | -| `indexType` | `string` | `'flat'` | Index type | -| `embedder` | `Embedder` | `undefined` | Text embedding function | - -### Methods - -#### add(documents) - -Add documents to the database. 
-
-```ts
-async add(documents: Document | Document[]): Promise<string[]>
-```
-
-**Parameters:**
-
-```ts
-interface Document {
-  id?: string;
-  content: string;
-  vector?: number[];
-  metadata?: Record<string, unknown>;
-}
-```
-
-**Returns:** Array of document IDs
-
-**Example:**
-
-```ts
-const ids = await db.add([
-  { content: 'First document', metadata: { category: 'a' } },
-  { content: 'Second document', metadata: { category: 'b' } },
-]);
-```
-
-#### search(query, options?)
-
-Search for similar documents.
-
-```ts
-async search(query: string | number[], options?: SearchOptions): Promise<SearchResult[]>
-```
-
-**Parameters:**
-
-```ts
-interface SearchOptions {
-  k?: number;           // Number of results (default: 10)
-  filter?: FilterQuery; // Metadata filter
-  threshold?: number;   // Minimum similarity score
-}
-```
-
-**Returns:**
-
-```ts
-interface SearchResult {
-  id: string;
-  content: string;
-  score: number;
-  metadata?: Record<string, unknown>;
-}
-```
-
-**Example:**
-
-```ts
-const results = await db.search('query text', {
-  k: 5,
-  filter: { category: 'a' },
-  threshold: 0.7,
-});
-```
-
-#### get(id)
-
-Get a document by ID.
-
-```ts
-async get(id: string): Promise<Document | null>
-```
-
-#### delete(id)
-
-Delete a document by ID.
-
-```ts
-async delete(id: string): Promise<void>
-```
-
-#### update(id, document)
-
-Update a document.
-
-```ts
-async update(id: string, document: Partial<Document>): Promise<void>
-```
-
-#### count()
-
-Get total document count.
-
-```ts
-async count(): Promise<number>
-```
-
-#### clear()
-
-Remove all documents.
-
-```ts
-async clear(): Promise<void>
-```
-
-## Filter Queries
-
-### Basic Filters
-
-```ts
-// Exact match
-{ category: 'tech' }
-
-// Multiple conditions (AND)
-{ category: 'tech', status: 'published' }
-```
-
-### Comparison Operators
-
-```ts
-// Greater than
-{ price: { $gt: 100 } }
-
-// Greater than or equal
-{ price: { $gte: 100 } }
-
-// Less than
-{ price: { $lt: 100 } }
-
-// Less than or equal
-{ price: { $lte: 100 } }
-
-// Not equal
-{ status: { $ne: 'draft' } }
-
-// In array
-{ category: { $in: ['tech', 'science'] } }
-
-// Not in array
-{ category: { $nin: ['spam', 'test'] } }
-```
-
-### Logical Operators
-
-```ts
-// OR
-{ $or: [{ category: 'tech' }, { category: 'science' }] }
-
-// AND (explicit)
-{ $and: [{ price: { $gt: 10 } }, { price: { $lt: 100 } }] }
-
-// NOT
-{ $not: { status: 'draft' } }
-```
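-
-For example, the operators above can be combined in a single `search` call (an illustrative query against the `db` instance from the earlier examples):
-
-```ts
-// Published tech or science documents priced between 10 and 100
-const matches = await db.search('recent articles', {
-  k: 10,
-  filter: {
-    $and: [
-      { category: { $in: ['tech', 'science'] } },
-      { status: 'published' },
-      { price: { $gt: 10 } },
-      { price: { $lt: 100 } },
-    ],
-  },
-});
-```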
-
-### Array Operators
-
-```ts
-// Contains element
-{ tags: { $contains: 'javascript' } }
-
-// Contains all elements
-{ tags: { $all: ['javascript', 'typescript'] } }
-```
-
-## Persistence Adapters
-
-### Memory Adapter (Default)
-
-```ts
-import { VectoriaDB, MemoryAdapter } from 'vectoriadb';
-
-const db = new VectoriaDB({
-  persistence: new MemoryAdapter(),
-});
-```
-
-### File Adapter
-
-```ts
-import { VectoriaDB, FileAdapter } from 'vectoriadb';
-
-const db = new VectoriaDB({
-  persistence: new FileAdapter({
-    path: './data/vectors.json',
-    autoSave: true,
-    saveInterval: 30000, // 30 seconds
-  }),
-});
-```
-
-### Redis Adapter
-
-```ts
-import { VectoriaDB, RedisAdapter } from 'vectoriadb';
-import Redis from 'ioredis';
-
-const redis = new Redis(process.env.REDIS_URL);
-
-const db = new VectoriaDB({
-  persistence: new RedisAdapter({
-    client: redis,
-    prefix: 'vectoria:',
-    ttl: 86400, // 24 hours
-  }),
-});
-```
-
-### Custom Adapter
-
-```ts
-import { PersistenceAdapter, Document } from 'vectoriadb';
-
-class CustomAdapter implements PersistenceAdapter {
-  async load(): Promise<Document[]> {
-    // Load documents from storage
-  }
-
-  async save(documents: Document[]): Promise<void> {
-    // Save documents to storage
-  }
-
-  async addOne(document: Document): Promise<void> {
-    // Add single document
-  }
-
-  async deleteOne(id: string): Promise<void> {
-    // Delete single document
-  }
-}
-```
-
-## HNSW Index
-
-For large-scale datasets, use HNSW indexing.
-
-```ts
-import { VectoriaDB } from 'vectoriadb';
-
-const db = new VectoriaDB({
-  indexType: 'hnsw',
-  hnswOptions: {
-    M: 16,               // Max connections per node
-    efConstruction: 200, // Build quality
-    efSearch: 100,       // Search quality
-  },
-});
-```
-
-### HNSWOptions
-
-```ts
-interface HNSWOptions {
-  M?: number;              // Max connections (default: 16)
-  efConstruction?: number; // Build quality (default: 200)
-  efSearch?: number;       // Search quality (default: 100)
-}
-```
-
-## Embedders
-
-### Custom Embedder
-
-```ts
-import { VectoriaDB, Embedder } from 'vectoriadb';
-import OpenAI from 'openai';
-
-const openai = new OpenAI();
-
-const embedder: Embedder = {
-  async embed(text: string): Promise<number[]> {
-    const response = await openai.embeddings.create({
-      model: 'text-embedding-3-small',
-      input: text,
-    });
-    return response.data[0].embedding;
-  },
-  dimensions: 1536,
-};
-
-const db = new VectoriaDB({ embedder });
-```
-
-### OpenAI Embedder
-
-```ts
-import { VectoriaDB, createOpenAIEmbedder } from 'vectoriadb';
-import OpenAI from 'openai';
-
-const openai = new OpenAI();
-const embedder = createOpenAIEmbedder(openai, {
-  model: 'text-embedding-3-small',
-});
-
-const db = new VectoriaDB({ embedder });
-```
-
-## TF-IDF Variant
-
-Zero-dependency text search.
-
-```ts
-import { VectoriaTFIDF } from 'vectoriadb/tfidf';
-
-const db = new VectoriaTFIDF();
-
-await db.add([
-  { content: 'JavaScript programming guide' },
-  { content: 'TypeScript tutorial for beginners' },
-]);
-
-const results = await db.search('programming tutorial');
-```
-
-## Complete Example
-
-```ts
-import { VectoriaDB, FileAdapter, createOpenAIEmbedder } from 'vectoriadb';
-import OpenAI from 'openai';
-
-// Setup
-const openai = new OpenAI();
-const embedder = createOpenAIEmbedder(openai);
-
-const db = new VectoriaDB({
-  embedder,
-  persistence: new FileAdapter({ path: './data/docs.json' }),
-  indexType: 'hnsw',
-  metric: 'cosine',
-});
-
-// Add documents
-await db.add([
-  {
-    content: 'React is a JavaScript library for building user interfaces',
-    metadata: { category: 'frontend', language: 'javascript' },
-  },
-  {
-    content: 'Node.js is a JavaScript runtime for server-side development',
-    metadata: { category: 'backend', language: 'javascript' },
-  },
-  {
-    content: 'PostgreSQL is a powerful open-source relational database',
-    metadata: { category: 'database', language: 'sql' },
-  },
-]);
-
-// Search with filter
-const results = await db.search('building web applications', {
-  k: 5,
-  filter: { language: 'javascript' },
-  threshold: 0.5,
-});
-
-console.log(results);
-// [
-//   { id: '...', content: 'React is...', score: 0.89, metadata: {...} },
-//   { id: '...', content: 'Node.js is...', score: 0.72, metadata: {...} },
-// ]
-
-// Get document
-const doc = await db.get(results[0].id);
-if (!doc) {
-  throw new Error('Document not found');
-}
-
-// Update document
-await db.update(results[0].id, {
-  metadata: { ...doc.metadata, popular: true },
-});
-
-// Delete document
-await db.delete(results[0].id);
-
-// Count
-const count = await db.count();
-console.log(`Total documents: ${count}`);
-```
-
-## Types
-
-### Document
-
-```ts
-interface Document {
-  id?: string;
-  content: string;
-  vector?: number[];
-  metadata?: Record<string, unknown>;
-}
-```
-
-### SearchResult
-
-```ts
-interface SearchResult {
-  id: string;
-  content: string;
-  score: number;
-  metadata?: Record<string, unknown>;
-  vector?: number[];
-}
-```
-
-### Embedder
-
-```ts
-interface Embedder {
-  embed(text: string): Promise<number[]>;
-  dimensions: number;
-}
-```
-
-### PersistenceAdapter
-
-```ts
-interface PersistenceAdapter {
-  load(): Promise<Document[]>;
-  save(documents: Document[]): Promise<void>;
-  addOne?(document: Document): Promise<void>;
-  deleteOne?(id: string): Promise<void>;
-  updateOne?(id: string, document: Partial<Document>): Promise<void>;
-}
-```
-
-## Related
-
-- [Overview](/core-libraries/vectoriadb/overview) - Feature overview
-- [Search](/core-libraries/vectoriadb/search) - Search techniques
-- [HNSW](/core-libraries/vectoriadb/hnsw) - HNSW indexing
diff --git a/docs/changelog.mdx b/docs/changelog.mdx
index 6b51610..db7909a 100644
--- a/docs/changelog.mdx
+++ b/docs/changelog.mdx
@@ -77,8 +77,6 @@ mode: 'center'
 
   **ast-guard v1.0.0** – Production-ready AST security guard with 100% CVE coverage for vm2/isolated-vm/node-vm exploits.
 
-  **vectoriadb v1.0.0** – Lightweight in-memory vector database with HNSW indexing for semantic search.
-
   **enclave-vm v1.0.0** – Secure AgentScript execution environment with defense-in-depth architecture.
diff --git a/docs/core-libraries/enclave-vm/ai-scoring.mdx b/docs/core-libraries/enclave-vm/ai-scoring.mdx
index 56f2aa9..dc35b5c 100644
--- a/docs/core-libraries/enclave-vm/ai-scoring.mdx
+++ b/docs/core-libraries/enclave-vm/ai-scoring.mdx
@@ -88,6 +88,48 @@ const enclave = new Enclave({
 });
 ```
 
+### Similarity Mode with VectoriaDB
+
+For pattern-matching against known malicious code patterns, use similarity mode with VectoriaDB:
+
+```ts
+import { Enclave } from 'enclave-vm';
+
+const enclave = new Enclave({
+  scoringGate: {
+    scorer: 'local-llm',
+    localLlm: {
+      modelId: 'Xenova/all-MiniLM-L6-v2',
+      mode: 'similarity',
+      vectoriaConfig: {
+        threshold: 0.85, // Similarity threshold (0-1)
+        topK: 5,         // Number of results to consider
+        modelName: 'Xenova/all-MiniLM-L6-v2', // Optional: override embedding model
+      },
+    },
+    blockThreshold: 70,
+    warnThreshold: 40,
+  },
+});
+```
+
+**VectoriaDB Configuration Options:**
+
+| Option      | Type     | Default                          | Description                                        |
+| ----------- | -------- | -------------------------------- | -------------------------------------------------- |
+| `threshold` | `number` | `0.85`                           | Similarity threshold (0-1) for considering a match |
+| `topK`      | `number` | `5`                              | Maximum number of similar patterns to return       |
+| `modelName` | `string` | Inherits from `localLlm.modelId` | Embedding model for similarity computation         |
+
+Similarity mode requires the optional `vectoriadb` peer dependency:
+
+```bash
+npm install vectoriadb
+```
+
+Similarity mode can operate without the Hugging Face transformers pipeline: it uses VectoriaDB for similarity search and falls back to heuristics when needed.
+
 ## Detection Rules
 
 The rule-based scorer evaluates these patterns:
@@ -218,6 +260,42 @@ const enclave = new Enclave({
 4. **Monitor signals** - Track which rules trigger most often
 5. **Layer with other defenses** - Scoring complements AST validation
 
+## Breaking Changes
+
+### v2.x: VectoriaConfigForScoring API Changes
+
+**Removed: `indexPath` option**
+
+The `indexPath` option has been removed from `VectoriaConfigForScoring`. This option was intended to load pre-built malicious pattern indexes, but VectoriaDB v2.x handles persistence differently, using storage adapters.
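+
+As a rough sketch of the v2.x approach (illustrative only; the adapter and method names below come from VectoriaDB's persistence API, not from `VectoriaConfigForScoring`):
+
+```ts
+import { VectoriaDB, FileStorageAdapter } from 'vectoriadb';
+
+// Build the malicious-pattern index once and persist it externally
+const patterns = new VectoriaDB({
+  storageAdapter: new FileStorageAdapter({ cacheDir: './.cache/patterns' }),
+});
+await patterns.initialize();
+// ...addMany() your pattern documents here, then:
+await patterns.saveToStorage();
+
+// On later boots, initialize() restores the index from storage
+// instead of re-embedding every pattern.
+```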
+ +**Migration:** + +```ts +// Before (v1.x) - No longer supported +const config = { + vectoriaConfig: { + indexPath: '/path/to/malicious-patterns.index', // REMOVED + threshold: 0.85, + }, +}; + +// After (v2.x) +const config = { + vectoriaConfig: { + threshold: 0.85, + topK: 5, + modelName: 'Xenova/all-MiniLM-L6-v2', + }, +}; +``` + +If you were using `indexPath` to load pre-built indexes, you'll need to handle persistence externally using VectoriaDB's storage adapter APIs (`saveToStorage()`, `MemoryStorageAdapter`, `FileStorageAdapter`, or `RedisStorageAdapter`). + +**New options in v2.x:** + +- `topK` - Control how many similar patterns to consider (default: 5) +- `modelName` - Override the embedding model (defaults to `localLlm.modelId`) + ## Related - [Security Levels](/core-libraries/enclave-vm/security-levels) - Security presets diff --git a/docs/core-libraries/vectoriadb/hnsw.mdx b/docs/core-libraries/vectoriadb/hnsw.mdx deleted file mode 100644 index d391c57..0000000 --- a/docs/core-libraries/vectoriadb/hnsw.mdx +++ /dev/null @@ -1,233 +0,0 @@ ---- -title: 'HNSW Indexing' -description: 'Scale to large datasets with HNSW approximate nearest neighbor search' ---- - -Enable HNSW (Hierarchical Navigable Small World) for datasets above roughly 10,000 documents. HNSW provides sub-millisecond queries with more than 95% recall. - -## When to Use HNSW - -| Documents | Recommendation | -|-----------|----------------| -| < 1,000 | Brute-force (default) | -| 1,000 - 10,000 | Either works | -| > 10,000 | Use HNSW | -| > 100,000 | HNSW required | - -## Basic Configuration - -```ts -const toolIndex = new VectoriaDB({ - useHNSW: true, - hnsw: { M: 16, efConstruction: 200, efSearch: 64 }, - maxDocuments: 150_000, - maxBatchSize: 2_000, -}); -``` - -## HNSW Parameters - -| Option | Default | Description | -| ---------------- | ------- | ---------------------------------------------------------- | -| `M` | 16 | Connections per node in layer > 0 (higher = better recall) | -| `M0` | 32 | Connections for layer 0 (typically M × 2) | -| `efConstruction` | 200 | Candidate list size during construction | -| `efSearch` | 50 | Candidate list size during search | - -## Understanding the Parameters - -### M (Max Connections) - -Controls the number of connections each node has in the graph. - -```ts -// Low M (8-12) - faster build, less memory, lower recall -const fast = new VectoriaDB({ - useHNSW: true, - hnsw: { M: 8 }, -}); - -// High M (24-48) - slower build, more memory, higher recall -const accurate = new VectoriaDB({ - useHNSW: true, - hnsw: { M: 32 }, -}); -``` - -### efConstruction - -Controls build-time quality. Higher values = better graph structure but slower indexing. - -```ts -// Fast construction, acceptable quality -const quickBuild = new VectoriaDB({ - useHNSW: true, - hnsw: { efConstruction: 100 }, -}); - -// Slow construction, excellent quality -const highQuality = new VectoriaDB({ - useHNSW: true, - hnsw: { efConstruction: 400 }, -}); -``` - -### efSearch - -Controls search-time quality. 
Can be adjusted per-query: - -```ts -const db = new VectoriaDB({ - useHNSW: true, - hnsw: { efSearch: 50 }, // Default -}); - -// High-precision search -const results = await db.search(query, { - topK: 10, - efSearch: 200, // Override for this query -}); - -// Fast search (lower recall) -const quick = await db.search(query, { - topK: 10, - efSearch: 20, -}); -``` - -## Performance Characteristics - -### Build Time - -| Documents | M=16, ef=200 | -|-----------|--------------| -| 10,000 | ~5 seconds | -| 50,000 | ~30 seconds | -| 100,000 | ~2 minutes | - -### Search Time - -| Documents | Brute-force | HNSW (ef=50) | -|-----------|-------------|--------------| -| 10,000 | ~50ms | ~1ms | -| 50,000 | ~250ms | ~1ms | -| 100,000 | ~500ms | ~2ms | - -### Memory Usage - -HNSW adds approximately 50-100 bytes per document for graph connections on top of the embedding storage. - -## Recall vs Speed Trade-offs - -```ts -// Configuration presets - -// Speed-optimized (95%+ recall) -const speedOptimized = { - M: 12, - efConstruction: 100, - efSearch: 30, -}; - -// Balanced (97%+ recall) -const balanced = { - M: 16, - efConstruction: 200, - efSearch: 50, -}; - -// Quality-optimized (99%+ recall) -const qualityOptimized = { - M: 32, - efConstruction: 400, - efSearch: 100, -}; -``` - -## Incremental Updates - -HNSW supports incremental updates without full rebuilds: - -```ts -const db = new VectoriaDB({ - useHNSW: true, -}); - -await db.initialize(); - -// Initial bulk load -await db.addMany(initialDocuments); - -// Later additions - HNSW index updated incrementally -await db.add('new-doc', 'New document text', { /* ... */ }); -``` - - - For very large bulk loads (100,000+ documents), consider disabling HNSW during import and enabling it after, then rebuilding the index. - - -## Persistence with HNSW - -The HNSW index structure is persisted along with embeddings: - -```ts -const db = new VectoriaDB({ - useHNSW: true, - storageAdapter: new FileStorageAdapter({ - cacheDir: './.cache/vectoriadb', - }), -}); - -await db.initialize(); -await db.addMany(documents); -await db.saveToStorage(); // Saves HNSW structure too - -// On restart, HNSW index is restored from storage -``` - -## Tuning Guidelines - -### For Real-Time Search Applications - -```ts -const realtime = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 16, - efConstruction: 200, - efSearch: 40, // Low for speed - }, -}); -``` - -### For High-Precision Applications - -```ts -const precision = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 24, - efConstruction: 400, - efSearch: 200, // High for accuracy - }, -}); -``` - -### For Memory-Constrained Environments - -```ts -const lowMemory = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 8, // Lower M uses less memory - efConstruction: 100, - efSearch: 30, - }, -}); -``` - -## Related - -- [Search](/core-libraries/vectoriadb/search) - Search options -- [Persistence](/core-libraries/vectoriadb/persistence) - Storage adapters -- [Overview](/core-libraries/vectoriadb/overview) - Getting started diff --git a/docs/core-libraries/vectoriadb/indexing.mdx b/docs/core-libraries/vectoriadb/indexing.mdx deleted file mode 100644 index 0c46f44..0000000 --- a/docs/core-libraries/vectoriadb/indexing.mdx +++ /dev/null @@ -1,188 +0,0 @@ ---- -title: 'Indexing' -description: 'Adding and updating documents in VectoriaDB' ---- - -Add documents with a unique `id`, the natural-language `text` to vectorize, and typed `metadata`. 
-
-## Adding Documents
-
-### Single Document
-
-```ts
-await toolIndex.add('users:list', 'List all users with pagination and filtering', {
-  id: 'users:list',
-  toolName: 'list',
-  owner: 'users',
-  tags: ['read', 'user-management'],
-  risk: 'safe',
-});
-```
-
-### Batch Indexing
-
-```ts
-const documents = [
-  {
-    id: 'billing:charge',
-    text: 'Charge a customer payment method',
-    metadata: {
-      id: 'billing:charge',
-      toolName: 'charge',
-      owner: 'billing',
-      tags: ['write', 'payment'],
-      risk: 'destructive',
-    },
-  },
-  {
-    id: 'billing:refund',
-    text: 'Process a refund for a customer',
-    metadata: {
-      id: 'billing:refund',
-      toolName: 'refund',
-      owner: 'billing',
-      tags: ['write', 'payment'],
-      risk: 'destructive',
-    },
-  },
-];
-
-await toolIndex.addMany(documents);
-```
-
-`addMany` validates every document, enforces `maxBatchSize`, and prevents duplicates.
-
-## Type-Safe Metadata
-
-Define your metadata interface for compile-time safety:
-
-```ts
-import { VectoriaDB, DocumentMetadata } from 'vectoriadb';
-
-interface ToolDocument extends DocumentMetadata {
-  toolName: string;
-  owner: string;
-  tags: string[];
-  risk: 'safe' | 'destructive';
-  deprecated?: boolean;
-}
-
-const db = new VectoriaDB<ToolDocument>();
-
-// TypeScript ensures metadata matches interface
-await db.add('id', 'text', {
-  id: 'id',
-  toolName: 'test',
-  owner: 'system',
-  tags: [],
-  risk: 'safe',
-  // TypeScript error if you add wrong fields
-});
-```
-
-## Updating Documents
-
-### Update Metadata Only
-
-Metadata-only updates are instant and don't trigger re-embedding:
-
-```ts
-await toolIndex.updateMetadata('users:list', {
-  deprecated: true,
-  tags: ['read', 'user-management', 'legacy'],
-});
-```
-
-### Update Text and Metadata
-
-When text changes, VectoriaDB re-embeds the document:
-
-```ts
-await toolIndex.update('users:list', {
-  text: 'Updated description for user listing',
-  metadata: {
-    id: 'users:list',
-    toolName: 'list',
-    owner: 'users',
-    tags: ['read'],
-    risk: 'safe',
-  },
-});
-```
-
-### Batch Updates
-
-```ts
-await toolIndex.updateMany([
-  {
-    id: 'users:list',
-    text: 'New description',
-    metadata: { /* ... */ },
-  },
-  {
-    id: 'billing:charge',
-    metadata: { deprecated: true }, // Metadata-only update
-  },
-]);
-```
-
-Keep the index current with `updateMetadata`, `update`, or `updateMany`. Metadata-only updates never trigger re-embedding, while text changes re-embed only the affected documents.
-
-## Removing Documents
-
-```ts
-// Single document
-await toolIndex.remove('users:list');
-
-// Multiple documents
-await toolIndex.removeMany(['users:list', 'billing:charge']);
-
-// Clear all documents
-await toolIndex.clear();
-```
-
-## Checking for Documents
-
-```ts
-// Check if document exists
-const exists = toolIndex.has('users:list');
-
-// Get document by ID
-const doc = toolIndex.get('users:list');
-if (doc) {
-  console.log(doc.metadata.toolName);
-}
-
-// Get count
-console.log(`Index contains ${toolIndex.size()} documents`);
-```
-
-## Document Limits
-
-VectoriaDB enforces limits to prevent DoS attacks:
-
-```ts
-const db = new VectoriaDB({
-  maxDocuments: 100000,     // Maximum documents in index
-  maxDocumentSize: 1000000, // Maximum text size in characters
-  maxBatchSize: 1000,       // Maximum documents per batch operation
-});
-```
-
-## Embedding Generation
-
-Embeddings are generated automatically when you add or update documents. The process:
-
-1. Text is tokenized using the configured model
-2. Embeddings are generated (~100-200 documents/second)
-3. Embeddings are stored in memory (and optionally persisted)
-
-For large imports, use `addMany` with appropriate `maxBatchSize` to avoid memory spikes, as in the sketch below.
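-
-A minimal chunked-import sketch (illustrative; it assumes the `documents` array from Batch Indexing and the default limits shown under Document Limits):
-
-```ts
-// Import in slices no larger than the configured maxBatchSize
-const BATCH_SIZE = 1000;
-
-for (let i = 0; i < documents.length; i += BATCH_SIZE) {
-  await toolIndex.addMany(documents.slice(i, i + BATCH_SIZE));
-}
-```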
-
-## Related
-
-- [Search](/core-libraries/vectoriadb/search) - Querying the index
-- [Persistence](/core-libraries/vectoriadb/persistence) - Persisting embeddings
-- [Overview](/core-libraries/vectoriadb/overview) - Getting started
diff --git a/docs/core-libraries/vectoriadb/overview.mdx b/docs/core-libraries/vectoriadb/overview.mdx
deleted file mode 100644
index a8c73aa..0000000
--- a/docs/core-libraries/vectoriadb/overview.mdx
+++ /dev/null
@@ -1,94 +0,0 @@
----
-title: 'Overview'
-description: 'Lightweight in-memory vector database for semantic search with offline embeddings'
----
-
-VectoriaDB is a production-ready in-memory vector database built on transformers.js. Use it to surface the right tool, prompt, or document snippet from natural-language queries without shipping data to an external service.
-
-- Embeddings run locally via transformers.js, so your data never leaves the server and you avoid API quotas.
-- Strong generics ensure every document you index keeps the same shape as your tool metadata.
-- Built-in rate limits, batch validation, HNSW indexing, and storage adapters keep the index production ready.
-
-## When to Use VectoriaDB
-
-- **Tool discovery** - Surface the right tool from natural-language queries
-- **Document search** - Semantic search over documents, prompts, or code snippets
-- **Recommendation systems** - Find similar items based on text embeddings
-- **Offline-first applications** - No external API dependencies
-
-The default Xenova `all-MiniLM-L6-v2` model is ~22 MB. The first initialization downloads and caches it under `cacheDir`; subsequent boots reuse the local copy.
-
-## Installation
-
-```bash
-npm install vectoriadb
-```
-
-## Quick Start
-
-```ts
-import { VectoriaDB, DocumentMetadata } from 'vectoriadb';
-
-interface ToolDocument extends DocumentMetadata {
-  toolName: string;
-  owner: string;
-  tags: string[];
-  risk: 'safe' | 'destructive';
-}
-
-const toolIndex = new VectoriaDB<ToolDocument>({
-  cacheDir: './.cache/transformers',
-  defaultSimilarityThreshold: 0.4,
-});
-
-await toolIndex.initialize(); // downloads and warms the embedding model once
-
-// Add a document
-await toolIndex.add('users:list', 'List all users with pagination', {
-  id: 'users:list',
-  toolName: 'list',
-  owner: 'users',
-  tags: ['read'],
-  risk: 'safe',
-});
-
-// Search
-const results = await toolIndex.search('find users');
-console.log(results[0].metadata.toolName); // 'list'
-```
-
-`initialize()` must run before `add`, `search`, or `update`. Calling it twice is safe because VectoriaDB short-circuits if it is already ready.
-
-## Core Concepts
-
-### Documents
-
-Each document has:
-
-- **id** - Unique identifier
-- **text** - Natural language text to embed
-- **metadata** - Type-safe custom data
-
-### Embeddings
-
-VectoriaDB generates embeddings locally using transformers.js. The default model is `all-MiniLM-L6-v2`, which provides good quality with fast inference.
-
-### Similarity Search
-
-Search returns documents ranked by cosine similarity to your query. You can filter results by metadata and set minimum similarity thresholds, as the short example below shows.
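-
-A compact illustration of similarity search plus metadata filtering (assuming the `toolIndex` and `ToolDocument` from the Quick Start above):
-
-```ts
-// Rank by cosine similarity, then keep only safe tools owned by 'users'
-const safeUserTools = await toolIndex.search('manage user accounts', {
-  topK: 5,
-  threshold: 0.5,
-  filter: (metadata) => metadata.owner === 'users' && metadata.risk === 'safe',
-});
-```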
- -## Related - -- [Indexing](/core-libraries/vectoriadb/indexing) - Adding and updating documents -- [Search](/core-libraries/vectoriadb/search) - Querying the index -- [Persistence](/core-libraries/vectoriadb/persistence) - Storage adapters -- [HNSW](/core-libraries/vectoriadb/hnsw) - Scaling to large datasets diff --git a/docs/core-libraries/vectoriadb/persistence.mdx b/docs/core-libraries/vectoriadb/persistence.mdx deleted file mode 100644 index c258068..0000000 --- a/docs/core-libraries/vectoriadb/persistence.mdx +++ /dev/null @@ -1,209 +0,0 @@ ---- -title: 'Persistence' -description: 'Storage adapters for persisting embeddings between restarts' ---- - -Avoid re-indexing on every boot by using storage adapters. VectoriaDB supports file, Redis, and in-memory storage. - -## Storage Adapters - -| Adapter | Use Case | Persistence | -|---------|----------|-------------| -| `MemoryStorageAdapter` | Development, testing | None (default) | -| `FileStorageAdapter` | Single-server deployment | Local disk | -| `RedisStorageAdapter` | Multi-pod deployment | Shared cache | - -## File Adapter - -Persist embeddings to local disk: - -```ts -import { VectoriaDB, FileStorageAdapter, createToolsHash } from 'vectoriadb'; - -const documents = collectToolDocuments(); - -const toolIndex = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: './.cache/vectoriadb', - namespace: 'tool-index', - }), - toolsHash: createToolsHash(documents), - version: process.env.npm_package_version, -}); - -await toolIndex.initialize(); - -if (toolIndex.size() === 0) { - await toolIndex.addMany(documents); - await toolIndex.saveToStorage(); // Persist to disk -} -``` - -### File Adapter Options - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `cacheDir` | string | `./.cache/vectoriadb` | Directory for cache files | -| `namespace` | string | `'default'` | Namespace for isolation | - -## Redis Adapter - -For multi-pod environments, use Redis to share embeddings: - -```ts -import { VectoriaDB, RedisStorageAdapter } from 'vectoriadb'; -import Redis from 'ioredis'; - -const redisClient = new Redis(); - -const toolIndex = new VectoriaDB({ - storageAdapter: new RedisStorageAdapter({ - client: redisClient, - namespace: 'tool-index', - ttl: 86400, // 24 hours (default) - keyPrefix: 'vectoriadb', - }), -}); - -await toolIndex.initialize(); - -if (toolIndex.size() === 0) { - await toolIndex.addMany(documents); - await toolIndex.saveToStorage(); -} -``` - -### Redis Adapter Options - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `client` | Redis | required | ioredis client instance | -| `namespace` | string | `'default'` | Namespace for isolation | -| `ttl` | number | `86400` | Time-to-live in seconds | -| `keyPrefix` | string | `'vectoriadb'` | Redis key prefix | - -## Memory Adapter (Default) - -No persistence - embeddings are lost on restart: - -```ts -import { VectoriaDB, MemoryStorageAdapter } from 'vectoriadb'; - -const toolIndex = new VectoriaDB({ - storageAdapter: new MemoryStorageAdapter({ namespace: 'tools' }), -}); -``` - -Use for development or when re-indexing is fast enough. - -## Cache Invalidation - -VectoriaDB automatically invalidates the cache when documents change. 
Use `toolsHash` and `version` to control invalidation: - -```ts -const toolIndex = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ cacheDir: './.cache' }), - - // Hash of document contents - invalidates when documents change - toolsHash: createToolsHash(documents), - - // Application version - invalidates on deployments - version: process.env.npm_package_version, -}); -``` - -### How Invalidation Works - -On `initialize()`, VectoriaDB checks: -1. Does the cache file/key exist? -2. Does `toolsHash` match? -3. Does `version` match? -4. Does `modelName` match? - -If any check fails, the cache is invalidated and re-indexing occurs. - -## Warm-up Pattern - -Common pattern for production deployments: - -```ts -export async function warmToolIndex(documents: ToolDocument[]) { - const toolIndex = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: './.cache/vectoriadb', - namespace: 'tool-index', - }), - toolsHash: createToolsHash(documents), - version: process.env.npm_package_version, - }); - - await toolIndex.initialize(); - - // Only re-index if cache was invalidated - if (toolIndex.size() === 0) { - console.log('Cache miss - re-indexing...'); - await toolIndex.addMany(documents); - await toolIndex.saveToStorage(); - } else { - console.log('Cache hit - loaded from storage'); - } - - return toolIndex; -} -``` - -## Manual Storage Operations - -```ts -// Save current state to storage -await toolIndex.saveToStorage(); - -// Load from storage (done automatically on initialize) -await toolIndex.loadFromStorage(); - -// Clear storage -await toolIndex.clearStorage(); -``` - -## Multi-Tenant Isolation - -Use namespaces to isolate different indexes: - -```ts -// Tenant A -const tenantAIndex = new VectoriaDB({ - storageAdapter: new RedisStorageAdapter({ - client: redisClient, - namespace: 'tenant-a', - }), -}); - -// Tenant B -const tenantBIndex = new VectoriaDB({ - storageAdapter: new RedisStorageAdapter({ - client: redisClient, - namespace: 'tenant-b', - }), -}); -``` - -## Error Handling - -```ts -import { StorageError } from 'vectoriadb'; - -try { - await toolIndex.saveToStorage(); -} catch (error) { - if (error instanceof StorageError) { - console.error('Storage operation failed:', error.message); - // Fallback to in-memory only - } -} -``` - -## Related - -- [Overview](/core-libraries/vectoriadb/overview) - Getting started -- [HNSW](/core-libraries/vectoriadb/hnsw) - Scaling with HNSW index -- [Indexing](/core-libraries/vectoriadb/indexing) - Adding documents diff --git a/docs/core-libraries/vectoriadb/search.mdx b/docs/core-libraries/vectoriadb/search.mdx deleted file mode 100644 index c15b479..0000000 --- a/docs/core-libraries/vectoriadb/search.mdx +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: 'Semantic Search' -description: 'Querying the index with natural language and filters' ---- - -Query the VectoriaDB index with natural language and optional metadata filters. 
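-
-Each result's `score` is the cosine similarity between the query embedding and that document's embedding. As a rough sketch of the scoring math (illustrative only, not VectoriaDB's internal implementation):
-
-```ts
-// Cosine similarity: dot(a, b) / (|a| * |b|); 1.0 means identical direction
-function cosineSimilarity(a: number[], b: number[]): number {
-  let dot = 0;
-  let normA = 0;
-  let normB = 0;
-  for (let i = 0; i < a.length; i++) {
-    dot += a[i] * b[i];
-    normA += a[i] * a[i];
-    normB += b[i] * b[i];
-  }
-  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
-}
-```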
- -## Basic Search - -```ts -const results = await toolIndex.search('reset a billing password'); - -for (const result of results) { - console.log(`${result.metadata.toolName} (${result.score.toFixed(2)})`); -} -``` - -## Search Options - -```ts -const matches = await toolIndex.search('reset a billing password', { - topK: 5, // Maximum results (default: 10) - threshold: 0.45, // Minimum similarity (default: 0.3) - filter: (metadata) => // Metadata filter function - metadata.owner === 'billing' && - !metadata.tags.includes('deprecated'), - includeVector: false, // Include raw vectors in results -}); -``` - -### Options Reference - -| Option | Type | Default | Description | -| --------------- | ---------- | ------- | ------------------------------------ | -| `topK` | `number` | `10` | Maximum results to return | -| `threshold` | `number` | `0.3` | Minimum similarity score (0-1) | -| `filter` | `function` | - | Filter function for metadata | -| `includeVector` | `boolean` | `false` | Include raw vectors in results | - -## Search Results - -```ts -interface SearchResult { - id: string; // Document ID - score: number; // Similarity score (0-1) - metadata: T; // Document metadata - vector?: number[]; // Embedding vector (if includeVector: true) -} -``` - -## Filtering Results - -### Simple Filters - -```ts -// Filter by owner -const results = await toolIndex.search('user management', { - filter: (m) => m.owner === 'users', -}); - -// Filter by tag -const results = await toolIndex.search('payment', { - filter: (m) => m.tags.includes('billing'), -}); - -// Exclude deprecated -const results = await toolIndex.search('list items', { - filter: (m) => !m.deprecated, -}); -``` - -### Complex Filters - -```ts -const results = await toolIndex.search('sensitive operation', { - filter: (metadata) => { - // Must be owned by specific teams - const allowedOwners = ['billing', 'users', 'orders']; - if (!allowedOwners.includes(metadata.owner)) return false; - - // Must not be deprecated - if (metadata.deprecated) return false; - - // Must not be destructive OR user has elevated permissions - if (metadata.risk === 'destructive' && !userHasPermission) return false; - - return true; - }, -}); -``` - -## Similarity Thresholds - -Adjust thresholds based on your use case: - -```ts -// Strict matching - only highly relevant results -const strict = await toolIndex.search(query, { threshold: 0.7 }); - -// Moderate matching - good balance -const moderate = await toolIndex.search(query, { threshold: 0.5 }); - -// Loose matching - include tangentially related -const loose = await toolIndex.search(query, { threshold: 0.3 }); -``` - - - Start with a lower threshold (0.3-0.4) and increase it if you're getting too many irrelevant results. - - -## Non-Semantic Filtering - -For filtering without semantic search, use `filter()`: - -```ts -// Get all tools by owner (no semantic ranking) -const billingTools = toolIndex.filter( - (metadata) => metadata.owner === 'billing' -); - -// Get all deprecated tools -const deprecated = toolIndex.filter( - (metadata) => metadata.deprecated === true -); -``` - -## Performance Tips - -### 1. Use Appropriate topK - -Request only as many results as you need: - -```ts -// Good - only fetch what you'll display -const results = await toolIndex.search(query, { topK: 5 }); - -// Wasteful - fetching more than needed -const results = await toolIndex.search(query, { topK: 1000 }); -``` - -### 2. 
Use Filters to Reduce Search Space - -Apply metadata filters to narrow results before similarity ranking: - -```ts -// Filter first, then rank - more efficient -const results = await toolIndex.search(query, { - filter: (m) => m.owner === 'billing', - topK: 10, -}); -``` - -### 3. Enable HNSW for Large Datasets - -For datasets > 10,000 documents, enable HNSW indexing: - -```ts -const db = new VectoriaDB({ - useHNSW: true, - hnsw: { efSearch: 50 }, -}); -``` - -See [HNSW](/core-libraries/vectoriadb/hnsw) for details. - -## Error Handling - -```ts -import { QueryValidationError } from 'vectoriadb'; - -try { - const results = await toolIndex.search(query, { topK: -1 }); -} catch (error) { - if (error instanceof QueryValidationError) { - console.error('Invalid search parameters:', error.message); - } -} -``` - -## Related - -- [Indexing](/core-libraries/vectoriadb/indexing) - Adding documents -- [HNSW](/core-libraries/vectoriadb/hnsw) - Scaling search -- [Overview](/core-libraries/vectoriadb/overview) - Getting started diff --git a/docs/core-libraries/vectoriadb/tfidf.mdx b/docs/core-libraries/vectoriadb/tfidf.mdx deleted file mode 100644 index bf6dca4..0000000 --- a/docs/core-libraries/vectoriadb/tfidf.mdx +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: 'TF-IDF Variant' -description: 'Zero-dependency keyword-based search for simple use cases' ---- - -For scenarios where ML model downloads aren't acceptable, use the TF-IDF variant. It provides keyword-based search with zero external dependencies. - -## When to Use TF-IDF - -| Scenario | Recommendation | -|----------|----------------| -| Small corpus (< 10K docs) | TF-IDF works well | -| No network access for model download | Use TF-IDF | -| Keyword matching is sufficient | Use TF-IDF | -| Semantic understanding required | Use VectoriaDB | -| Large corpus (> 10K docs) | Use VectoriaDB + HNSW | - -## Basic Usage - -```ts -import { TFIDFVectoria, DocumentMetadata } from 'vectoriadb'; - -interface ToolDocument extends DocumentMetadata { - toolName: string; - category: string; -} - -const db = new TFIDFVectoria({ - defaultSimilarityThreshold: 0.0, - defaultTopK: 10, -}); - -// Add documents -db.addDocument('tool1', 'User authentication tool', { - id: 'tool1', - toolName: 'auth', - category: 'security', -}); - -db.addDocument('tool2', 'User profile retrieval', { - id: 'tool2', - toolName: 'profile', - category: 'user', -}); - -// Reindex after adding documents (required for IDF update) -db.reindex(); - -// Search -const results = db.search('authentication', { topK: 5 }); -``` - -## Key Differences from VectoriaDB - -| Feature | TFIDFVectoria | VectoriaDB | -| ---------------------- | ------------------------------ | ----------------------------- | -| Dependencies | Zero | transformers.js (~22MB model) | -| Initialization | Synchronous | Async (model download) | -| Semantic understanding | Keyword-based | Full semantic | -| Best for | Small corpora (under 10K docs) | Any size | -| Reindex required | Yes, after changes | No | - -## Important: Reindexing - -TF-IDF requires reindexing after document changes to update IDF (Inverse Document Frequency) values: - -```ts -// Add documents -db.addDocument('doc1', 'Text one', metadata1); -db.addDocument('doc2', 'Text two', metadata2); - -// MUST reindex before searching -db.reindex(); - -// Now search works -const results = db.search('query'); - -// After adding more documents -db.addDocument('doc3', 'Text three', metadata3); -db.reindex(); // Reindex again -``` - - - Forgetting to call `reindex()` after changes 
will result in incorrect search results. - - -## Configuration Options - -```ts -const db = new TFIDFVectoria({ - defaultSimilarityThreshold: 0.0, // Minimum score (0-1) - defaultTopK: 10, // Default results limit -}); -``` - -## Search Options - -```ts -const results = db.search('query', { - topK: 5, // Maximum results - threshold: 0.1, // Minimum score - filter: (metadata) => metadata.category === 'security', -}); -``` - -## TF-IDF Algorithm - -TF-IDF (Term Frequency-Inverse Document Frequency) works by: - -1. **Term Frequency (TF)**: How often a term appears in a document -2. **Inverse Document Frequency (IDF)**: How rare a term is across all documents -3. **TF-IDF Score**: TF × IDF - terms that are frequent in a document but rare overall get high scores - -This means: -- Common words like "the", "is", "a" get low scores (low IDF) -- Unique terms specific to a document get high scores -- The query is matched against TF-IDF vectors using cosine similarity - -## Example: Tool Discovery - -```ts -import { TFIDFVectoria } from 'vectoriadb'; - -interface Tool { - id: string; - name: string; - category: string; -} - -const toolSearch = new TFIDFVectoria(); - -// Index tools with descriptive text -toolSearch.addDocument( - 'user-create', - 'Create new user account registration signup', - { id: 'user-create', name: 'createUser', category: 'users' } -); - -toolSearch.addDocument( - 'user-delete', - 'Delete remove user account termination', - { id: 'user-delete', name: 'deleteUser', category: 'users' } -); - -toolSearch.addDocument( - 'payment-charge', - 'Charge payment credit card billing', - { id: 'payment-charge', name: 'charge', category: 'billing' } -); - -toolSearch.reindex(); - -// Search -const results = toolSearch.search('create account'); -// Returns: user-create with high score (matches "create" and "account") -``` - -## Limitations - -1. **No semantic understanding** - "car" won't match "automobile" -2. **Reindex requirement** - Must call `reindex()` after changes -3. **Limited to keywords** - Misspellings and synonyms aren't handled -4. 
**Memory for large vocabularies** - IDF tables grow with vocabulary size - -## Hybrid Approach - -For best of both worlds, you can use TF-IDF as a pre-filter before semantic search: - -```ts -// Fast TF-IDF pre-filter -const tfidfResults = tfidfIndex.search(query, { topK: 100, threshold: 0.1 }); - -// Semantic re-ranking on smaller set -const semanticResults = await vectoriaDB.searchByIds( - tfidfResults.map(r => r.id), - query -); -``` - -## Related - -- [Overview](/core-libraries/vectoriadb/overview) - Getting started -- [Search](/core-libraries/vectoriadb/search) - Semantic search options -- [Persistence](/core-libraries/vectoriadb/persistence) - Storage adapters diff --git a/docs/docs.json b/docs/docs.json index a221a80..a8a42bd 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -109,18 +109,6 @@ "core-libraries/ast-guard/custom-rules" ] }, - { - "group": "vectoriadb", - "icon": "database", - "pages": [ - "core-libraries/vectoriadb/overview", - "core-libraries/vectoriadb/indexing", - "core-libraries/vectoriadb/search", - "core-libraries/vectoriadb/persistence", - "core-libraries/vectoriadb/hnsw", - "core-libraries/vectoriadb/tfidf" - ] - }, { "group": "EnclaveJS Streaming", "icon": "wave-pulse", @@ -152,7 +140,6 @@ "pages": [ "api-reference/enclave-vm", "api-reference/ast-guard", - "api-reference/vectoriadb", "api-reference/enclavejs-types", "api-reference/enclavejs-broker", "api-reference/enclavejs-client", diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index 3f4adf7..565d33b 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -30,14 +30,6 @@ AST-based JavaScript validator with 100% CVE coverage. npm install ast-guard ``` -### vectoriadb - -Lightweight in-memory vector database for semantic search. - -```bash -npm install vectoriadb -``` - ## EnclaveJS Streaming Runtime The EnclaveJS packages provide a streaming runtime for real-time code execution with tool orchestration. @@ -167,7 +159,6 @@ yarn test |---------|---------|-------------| | enclave-vm | 2.7.0 | Secure sandbox runtime | | ast-guard | 2.4.0 | AST security validator | -| vectoriadb | 2.0.2 | Vector database | | @enclavejs/types | 0.1.0 | Protocol types | | @enclavejs/stream | 0.1.0 | Streaming protocol | | @enclavejs/broker | 0.1.0 | Tool broker | diff --git a/docs/getting-started/welcome.mdx b/docs/getting-started/welcome.mdx index c6ed00c..7f878a4 100644 --- a/docs/getting-started/welcome.mdx +++ b/docs/getting-started/welcome.mdx @@ -27,16 +27,13 @@ When AI agents generate code, you need to execute it safely. 
Enclave provides: ## Core Libraries - + Secure JavaScript sandbox with AST validation, runtime isolation, and AI scoring gate AST-based JavaScript validator with 16 security rules and code transformation - - Lightweight in-memory vector database with HNSW indexing for semantic search - ## EnclaveJS Streaming Runtime @@ -115,9 +112,6 @@ npm install enclave-vm # AST validation npm install ast-guard -# Vector database -npm install vectoriadb - # Streaming runtime npm install @enclavejs/broker @enclavejs/client @enclavejs/react ``` diff --git a/docs/libraries/vectoriadb.mdx b/docs/libraries/vectoriadb.mdx deleted file mode 100644 index c991f8a..0000000 --- a/docs/libraries/vectoriadb.mdx +++ /dev/null @@ -1,338 +0,0 @@ ---- -title: 'vectoriadb' -description: 'Lightweight in-memory vector database for semantic search with offline embeddings and production-ready guardrails' ---- - -# vectoriadb - -VectoriaDB is a production-ready in-memory vector database built on transformers.js. Use it to surface the right tool, prompt, or document snippet from natural-language queries without shipping data to an external service. - - - - Embeddings run locally via transformers.js, so your data never leaves the server and you avoid API quotas. - - - Strong generics ensure every document you index keeps the same shape as your tool metadata. - - - Built-in rate limits, batch validation, HNSW indexing, and storage adapters keep the index production ready. - - - -## When to Use VectoriaDB - -- **Tool discovery** - Surface the right tool from natural-language queries -- **Document search** - Semantic search over documents, prompts, or code snippets -- **Recommendation systems** - Find similar items based on text embeddings -- **Offline-first applications** - No external API dependencies - - - The default Xenova `all-MiniLM-L6-v2` model is ~22 MB. The first initialization downloads and caches it under `cacheDir`; subsequent boots reuse the local copy. - - -## Installation - -```bash -npm install vectoriadb -``` - -## Quick Start - -```ts -import { VectoriaDB, DocumentMetadata } from 'vectoriadb'; - -interface ToolDocument extends DocumentMetadata { - toolName: string; - owner: string; - tags: string[]; - risk: 'safe' | 'destructive'; -} - -const toolIndex = new VectoriaDB({ - cacheDir: './.cache/transformers', - defaultSimilarityThreshold: 0.4, -}); - -await toolIndex.initialize(); // downloads and warms the embedding model once -``` - -`initialize()` must run before `add`, `search`, or `update`. Calling it twice is safe because VectoriaDB short-circuits if it is already ready. - ---- - -## Indexing Documents - -Add documents with a unique `id`, the natural-language `text` to vectorize, and typed `metadata`: - -```ts -// Single document -await toolIndex.add('users:list', 'List all users with pagination and filtering', { - id: 'users:list', - toolName: 'list', - owner: 'users', - tags: ['read', 'user-management'], - risk: 'safe', -}); - -// Batch indexing -const documents = [ - { - id: 'billing:charge', - text: 'Charge a customer payment method', - metadata: { - id: 'billing:charge', - toolName: 'charge', - owner: 'billing', - tags: ['write', 'payment'], - risk: 'destructive', - }, - }, - // ... more documents -]; - -await toolIndex.addMany(documents); -``` - -`addMany` validates every document, enforces `maxBatchSize`, and prevents duplicates. - - - Keep the index current with `updateMetadata`, `update`, or `updateMany`. 
Metadata-only updates never trigger re-embedding, while text changes re-embed only the affected documents. - - ---- - -## Semantic Search - -Query the index with natural language and optional filters: - -```ts -const matches = await toolIndex.search('reset a billing password', { - topK: 5, - threshold: 0.45, - filter: (metadata) => metadata.owner === 'billing' && !metadata.tags.includes('deprecated'), -}); - -for (const match of matches) { - console.log(`${match.metadata.toolName} (${match.score.toFixed(2)})`); -} -``` - -### Search Options - -| Option | Type | Default | Description | -| --------------- | ---------- | ------- | ------------------------------------ | -| `topK` | `number` | `10` | Maximum results to return | -| `threshold` | `number` | `0.3` | Minimum similarity score | -| `filter` | `function` | - | Filter function for metadata | -| `includeVector` | `boolean` | `false` | Include raw vectors in results | - ---- - -## Persistence - -Avoid re-indexing on every boot by using storage adapters. - -### File Adapter - -```ts -import { VectoriaDB, FileStorageAdapter, createToolsHash } from 'vectoriadb'; - -const toolIndex = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: './.cache/vectoriadb', - namespace: 'tool-index', - }), - toolsHash: createToolsHash(documents), - version: process.env.npm_package_version, -}); - -await toolIndex.initialize(); - -if (toolIndex.size() === 0) { - await toolIndex.addMany(documents); - await toolIndex.saveToStorage(); // persist embeddings to disk -} -``` - -`toolsHash` automatically invalidates the cache when your document list or descriptions change. - -### Redis Adapter - -For multi-pod environments, use Redis to share embeddings: - -```ts -import { VectoriaDB, RedisStorageAdapter } from 'vectoriadb'; -import Redis from 'ioredis'; - -const redisClient = new Redis(); - -const toolIndex = new VectoriaDB({ - storageAdapter: new RedisStorageAdapter({ - client: redisClient, - namespace: 'tool-index', - ttl: 86400, // 24 hours (default) - keyPrefix: 'vectoriadb', - }), -}); -``` - -### Memory Adapter (Default) - -No persistence - embeddings are lost on restart: - -```ts -import { VectoriaDB, MemoryStorageAdapter } from 'vectoriadb'; - -const toolIndex = new VectoriaDB({ - storageAdapter: new MemoryStorageAdapter({ namespace: 'tools' }), -}); -``` - ---- - -## HNSW Indexing - -Enable HNSW (Hierarchical Navigable Small World) for datasets above roughly 10,000 documents. HNSW provides sub-millisecond queries with more than 95% recall. 
- -```ts -const toolIndex = new VectoriaDB({ - useHNSW: true, - hnsw: { M: 16, efConstruction: 200, efSearch: 64 }, - maxDocuments: 150_000, - maxBatchSize: 2_000, -}); -``` - -### HNSW Configuration - -| Option | Default | Description | -| ---------------- | ------- | ---------------------------------------------------------- | -| `M` | 16 | Connections per node in layer > 0 (higher = better recall) | -| `M0` | 32 | Connections for layer 0 (typically M × 2) | -| `efConstruction` | 200 | Candidate list size during construction | -| `efSearch` | 50 | Candidate list size during search | - ---- - -## Complete Configuration Options - -| Option | Type | Default | Description | -| ---------------------------- | --------- | --------------------------- | ------------------------------ | -| `modelName` | `string` | `'Xenova/all-MiniLM-L6-v2'` | Embedding model to use | -| `cacheDir` | `string` | `'./.cache/transformers'` | Model cache directory | -| `dimensions` | `number` | Auto-detected | Vector dimensions | -| `defaultSimilarityThreshold` | `number` | `0.3` | Minimum similarity score | -| `defaultTopK` | `number` | `10` | Default results limit | -| `useHNSW` | `boolean` | `false` | Enable HNSW index | -| `maxDocuments` | `number` | `100000` | Max documents (DoS protection) | -| `maxDocumentSize` | `number` | `1000000` | Max document size in chars | -| `maxBatchSize` | `number` | `1000` | Max batch operation size | -| `verboseErrors` | `boolean` | `true` | Enable detailed errors | - ---- - -## TF-IDF Variant (Zero Dependencies) - -For scenarios where ML model downloads aren't acceptable, use the TF-IDF variant: - -```ts -import { TFIDFVectoria } from 'vectoriadb'; - -interface ToolDocument extends DocumentMetadata { - toolName: string; - category: string; -} - -const db = new TFIDFVectoria({ - defaultSimilarityThreshold: 0.0, - defaultTopK: 10, -}); - -// Add documents -db.addDocument('tool1', 'User authentication tool', { id: 'tool1', toolName: 'auth', category: 'security' }); -db.addDocument('tool2', 'User profile retrieval', { id: 'tool2', toolName: 'profile', category: 'user' }); - -// Reindex after adding documents (required for IDF update) -db.reindex(); - -// Search -const results = db.search('authentication', { topK: 5 }); -``` - -### When to Use TF-IDF vs ML Embeddings - -| Feature | TFIDFVectoria | VectoriaDB | -| ---------------------- | ------------------------------ | ----------------------------- | -| Dependencies | Zero | transformers.js (~22MB model) | -| Initialization | Synchronous | Async (model download) | -| Semantic understanding | Keyword-based | Full semantic | -| Best for | Small corpora (under 10K docs) | Any size | -| Reindex required | Yes, after changes | No | - ---- - -## Error Handling - -All errors extend `VectoriaError` and include machine-readable `code` values: - -| Error | Code | Description | -| ----------------------------- | --------------------------- | -------------------------- | -| `VectoriaNotInitializedError` | `NOT_INITIALIZED` | Call `initialize()` first | -| `DocumentValidationError` | `DOCUMENT_VALIDATION_ERROR` | Invalid document data | -| `DocumentNotFoundError` | `DOCUMENT_NOT_FOUND` | Document ID doesn't exist | -| `DocumentExistsError` | `DOCUMENT_EXISTS` | Document ID already exists | -| `DuplicateDocumentError` | `DUPLICATE_DOCUMENT` | Duplicate in batch | -| `QueryValidationError` | `QUERY_VALIDATION_ERROR` | Invalid search query | -| `EmbeddingError` | `EMBEDDING_ERROR` | Model embedding failed | -| `StorageError` | `STORAGE_ERROR` | 
Storage operation failed | -| `ConfigurationError` | `CONFIGURATION_ERROR` | Invalid config | - -```ts -import { - VectoriaError, - VectoriaNotInitializedError, - DocumentValidationError, -} from 'vectoriadb'; - -try { - await toolIndex.add(doc.id, doc.text, doc.metadata); -} catch (error) { - if (error instanceof VectoriaNotInitializedError) { - await toolIndex.initialize(); - } else if (error instanceof DocumentValidationError) { - console.warn({ tool: error.documentId }, 'invalid document skipped'); - } else if (error instanceof VectoriaError) { - console.error({ code: error.code }, error.message); - throw error; - } else { - throw error; - } -} -``` - ---- - -## Monitoring & Stats - -Use `getStats()` to feed dashboards or health endpoints: - -```ts -const stats = toolIndex.getStats(); -/* -{ - totalEmbeddings: number; - dimensions: number; - estimatedMemoryBytes: number; - modelName: string; -} -*/ -``` - - - Pair stats with `toolIndex.size()`, `toolIndex.clear()`, and `toolIndex.clearStorage()` for maintenance commands or admin tooling. - - -## Links - -- [GitHub](https://github.com/agentfront/enclave/tree/main/libs/vectoriadb) -- [npm](https://www.npmjs.com/package/vectoriadb) diff --git a/libs/enclave-vm/eslint.config.mjs b/libs/enclave-vm/eslint.config.mjs index 513ed4c..88b9305 100644 --- a/libs/enclave-vm/eslint.config.mjs +++ b/libs/enclave-vm/eslint.config.mjs @@ -11,6 +11,7 @@ export default [ ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], ignoredDependencies: [ '@huggingface/transformers', // Optional peer dependency loaded dynamically + 'vectoriadb', // Optional peer dependency loaded dynamically '@jest/reporters', // Test-only dependency used by benchmark reporter ], }, diff --git a/libs/enclave-vm/package.json b/libs/enclave-vm/package.json index d116b9c..0ad5204 100644 --- a/libs/enclave-vm/package.json +++ b/libs/enclave-vm/package.json @@ -44,11 +44,15 @@ "zod": "^4.1.13" }, "peerDependencies": { - "@huggingface/transformers": "^3.2.2" + "@huggingface/transformers": "^3.2.2", + "vectoriadb": "^2.0.0" }, "peerDependenciesMeta": { "@huggingface/transformers": { "optional": true + }, + "vectoriadb": { + "optional": true } } } diff --git a/libs/enclave-vm/src/scoring/__tests__/local-llm-scorer.spec.ts b/libs/enclave-vm/src/scoring/__tests__/local-llm-scorer.spec.ts index 7d35785..ca3d69b 100644 --- a/libs/enclave-vm/src/scoring/__tests__/local-llm-scorer.spec.ts +++ b/libs/enclave-vm/src/scoring/__tests__/local-llm-scorer.spec.ts @@ -404,6 +404,69 @@ describe('LocalLlmScorer', () => { scorer.dispose(); }); + + it('should respect vectoriaConfig topK option', () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.9, + topK: 10, + modelName: 'custom-model', + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + expect(storedConfig.vectoriaConfig?.threshold).toBe(0.9); + expect(storedConfig.vectoriaConfig?.topK).toBe(10); + expect(storedConfig.vectoriaConfig?.modelName).toBe('custom-model'); + }); + + it('should report VectoriaDB availability status', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + // VectoriaDB won't be available in tests since it's not installed + expect(typeof scorer.isVectoriaDBAvailable()).toBe('boolean'); + + 
scorer.dispose(); + }); + + it('should fall back to heuristics when VectoriaDB is unavailable', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + // In tests, VectoriaDB is not installed, so it should fall back to heuristics + const features = createSensitiveFeatures(); + const result = await scorer.score(features); + + // Should still produce meaningful scores from heuristics + expect(result.scorerType).toBe('local-llm'); + expect(result.totalScore).toBeGreaterThan(0); + // Should have heuristic signals since VectoriaDB is not available + expect(result.signals.length).toBeGreaterThan(0); + + scorer.dispose(); + }); }); describe('getConfig()', () => { @@ -774,4 +837,654 @@ describe('LocalLlmScorer', () => { scorer.dispose(); }); }); + + describe('VectoriaDB similarity mode integration', () => { + // Mock VectoriaDB module + let mockVectoriaDB: { + initialize: jest.Mock; + search: jest.Mock; + }; + + let originalImport: typeof Function; + + beforeEach(() => { + mockVectoriaDB = { + initialize: jest.fn().mockResolvedValue(undefined), + search: jest.fn().mockResolvedValue([]), + }; + + // Store original Function constructor + originalImport = Function; + }); + + afterEach(() => { + jest.restoreAllMocks(); + }); + + describe('similarity mode without VectoriaDB installed', () => { + it('should gracefully handle missing VectoriaDB package', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + // Should initialize without errors even when VectoriaDB is not installed + expect(scorer.isReady()).toBe(true); + expect(scorer.isVectoriaDBAvailable()).toBe(false); + + scorer.dispose(); + }); + + it('should produce scores from heuristics when VectoriaDB is unavailable', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.9, + topK: 3, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createExfiltrationFeatures(); + const result = await scorer.score(features); + + // Should still detect patterns using heuristics or rule-based fallback + expect(result.totalScore).toBeGreaterThan(0); + expect(result.signals.length).toBeGreaterThan(0); + + // Should have exfiltration-related signals (either from ML heuristics or rule-based) + const hasExfilSignal = result.signals.some( + (s) => + s.id === 'ML_EXFILTRATION_PATTERN' || + s.id === 'ML_HIGH_FANOUT' || + s.id === 'EXFIL_PATTERN' || + s.id === 'EXCESSIVE_LIMIT', + ); + expect(hasExfilSignal).toBe(true); + + scorer.dispose(); + }); + + it('should handle sensitive data detection in similarity mode without VectoriaDB', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createSensitiveFeatures(); + const result = await scorer.score(features); + + // Should detect sensitive keywords via heuristics or rule-based fallback + expect(result.totalScore).toBeGreaterThan(0); + const 
hasSensitiveSignal = result.signals.some( + (s) => s.id === 'ML_CRITICAL_KEYWORD' || s.id === 'SENSITIVE_FIELD', + ); + expect(hasSensitiveSignal).toBe(true); + + scorer.dispose(); + }); + }); + + describe('vectoriaConfig options', () => { + it('should use default threshold of 0.85 when not specified', () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: {}, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + // threshold should be undefined in config, but scorer uses 0.85 default internally + expect(storedConfig.vectoriaConfig?.threshold).toBeUndefined(); + }); + + it('should use default topK of 5 when not specified', () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.9, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + // topK should be undefined in config, but scorer uses 5 default internally + expect(storedConfig.vectoriaConfig?.topK).toBeUndefined(); + }); + + it('should allow custom threshold values', () => { + const thresholds = [0.5, 0.75, 0.9, 0.95, 1.0]; + + for (const threshold of thresholds) { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { threshold }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + expect(scorer.getConfig().vectoriaConfig?.threshold).toBe(threshold); + } + }); + + it('should allow custom topK values', () => { + const topKValues = [1, 3, 5, 10, 20, 100]; + + for (const topK of topKValues) { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { topK }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + expect(scorer.getConfig().vectoriaConfig?.topK).toBe(topK); + } + }); + + it('should use modelId as default modelName for VectoriaDB', () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/custom-embedding-model', + mode: 'similarity', + vectoriaConfig: {}, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + // modelName not specified, should use modelId internally + expect(storedConfig.vectoriaConfig?.modelName).toBeUndefined(); + expect(storedConfig.modelId).toBe('Xenova/custom-embedding-model'); + }); + + it('should prefer vectoriaConfig.modelName over modelId', () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + modelName: 'Xenova/custom-embedding-model', + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + expect(storedConfig.vectoriaConfig?.modelName).toBe('Xenova/custom-embedding-model'); + }); + }); + + describe('similarity mode with different feature types', () => { + it('should handle low-risk features in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createLowRiskFeatures(); + const result = await scorer.score(features); + + // Low risk features should result in low score + 
expect(result.totalScore).toBeLessThan(30); + expect(result.riskLevel).toMatch(/^(none|low)$/); + + scorer.dispose(); + }); + + it('should handle high-risk exfiltration features in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createExfiltrationFeatures(); + const result = await scorer.score(features); + + // Should detect exfiltration pattern via heuristics + expect(result.totalScore).toBeGreaterThan(30); + expect(result.signals.some((s) => s.id.includes('EXFIL') || s.id.includes('FANOUT'))).toBe(true); + + scorer.dispose(); + }); + + it('should handle multi-category sensitive features in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createMultiSensitiveFeatures(); + const result = await scorer.score(features); + + // Should detect multiple sensitive categories via heuristics or rule-based + expect(result.totalScore).toBeGreaterThan(20); + const hasSensitiveSignal = result.signals.some( + (s) => s.id === 'ML_MULTI_SENSITIVE' || s.id === 'SENSITIVE_FIELD' || s.id === 'ML_CRITICAL_KEYWORD', + ); + expect(hasSensitiveSignal).toBe(true); + + scorer.dispose(); + }); + }); + + describe('scoring time tracking', () => { + it('should track scoring time in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createLowRiskFeatures(); + const result = await scorer.score(features); + + expect(result.scoringTimeMs).toBeGreaterThanOrEqual(0); + expect(typeof result.scoringTimeMs).toBe('number'); + + scorer.dispose(); + }); + + it('should report correct scorer type in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createSensitiveFeatures(); + const result = await scorer.score(features); + + expect(result.scorerType).toBe('local-llm'); + + scorer.dispose(); + }); + }); + + describe('dispose cleanup in similarity mode', () => { + it('should clean up VectoriaDB resources on dispose', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + expect(scorer.isReady()).toBe(true); + + scorer.dispose(); + + expect(scorer.isReady()).toBe(false); + expect(scorer.isVectoriaDBAvailable()).toBe(false); + }); + + it('should be safe to dispose multiple times', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + // Dispose multiple times should not throw + expect(() => { + scorer.dispose(); + scorer.dispose(); + scorer.dispose(); + 
}).not.toThrow(); + }); + }); + + describe('similarity mode with custom analyzer', () => { + it('should use custom analyzer in similarity mode', async () => { + const customAnalyzer: CustomAnalyzer = { + analyze: jest.fn().mockResolvedValue({ + score: 60, + signals: [ + { + id: 'CUSTOM_SIMILARITY_SIGNAL', + score: 60, + description: 'Custom similarity analysis', + level: 'medium' as const, + }, + ], + }), + }; + + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + }, + fallbackToRules: true, + customAnalyzer, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createLowRiskFeatures(); + const result = await scorer.score(features); + + expect(customAnalyzer.analyze).toHaveBeenCalled(); + expect(result.totalScore).toBe(60); + expect(result.signals.some((s) => s.id === 'CUSTOM_SIMILARITY_SIGNAL')).toBe(true); + + scorer.dispose(); + }); + + it('should fall back to rules when custom analyzer fails in similarity mode', async () => { + const customAnalyzer: CustomAnalyzer = { + analyze: jest.fn().mockRejectedValue(new Error('Similarity analysis failed')), + }; + + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + customAnalyzer, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features = createSensitiveFeatures(); + const result = await scorer.score(features); + + // Should fall back to rule-based scoring + expect(result.scorerType).toBe('local-llm'); + expect(typeof result.totalScore).toBe('number'); + + scorer.dispose(); + }); + }); + + describe('mode switching', () => { + it('should handle switching between classification and similarity modes', async () => { + // First, test classification mode + const classificationScorer = new LocalLlmScorer({ + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'classification', + fallbackToRules: true, + }); + await classificationScorer.initialize(); + + const features = createSensitiveFeatures(); + const classResult = await classificationScorer.score(features); + + expect(classResult.scorerType).toBe('local-llm'); + expect(classResult.totalScore).toBeGreaterThan(0); + + classificationScorer.dispose(); + + // Now test similarity mode + const similarityScorer = new LocalLlmScorer({ + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + }, + fallbackToRules: true, + }); + await similarityScorer.initialize(); + + const simResult = await similarityScorer.score(features); + + expect(simResult.scorerType).toBe('local-llm'); + expect(simResult.totalScore).toBeGreaterThan(0); + + similarityScorer.dispose(); + }); + + it('should default to classification mode when mode is not specified', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + fallbackToRules: true, + // mode not specified + }; + + const scorer = new LocalLlmScorer(config); + const storedConfig = scorer.getConfig(); + + expect(storedConfig.mode).toBe('classification'); + }); + }); + + describe('edge cases', () => { + it('should handle empty tool calls in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const features: ExtractedFeatures = { + toolCalls: [], + patterns: { + totalToolCalls: 0, + 
uniqueToolsCount: 0, + toolsInLoops: [], + maxLoopNesting: 0, + toolSequence: [], + iteratesOverToolResults: false, + }, + signals: { + maxLimit: 0, + maxStringLength: 0, + toolCallDensity: 0, + fanOutRisk: 0, + }, + sensitive: { + fieldsAccessed: [], + categories: [], + }, + meta: { + extractionTimeMs: 1, + codeHash: 'empty-hash', + lineCount: 0, + }, + }; + + const result = await scorer.score(features); + + expect(result.totalScore).toBe(0); + expect(result.riskLevel).toBe('none'); + + scorer.dispose(); + }); + + it('should handle very long tool sequences in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + vectoriaConfig: { + threshold: 0.85, + topK: 5, + }, + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + const toolCalls = Array.from({ length: 50 }, (_, i) => ({ + toolName: `tool:action${i}`, + isStaticName: true, + argumentKeys: ['arg1'], + stringLiterals: [], + numericLiterals: [], + insideLoop: false, + loopDepth: 0, + location: { line: i + 1, column: 1 }, + })); + + const features: ExtractedFeatures = { + toolCalls, + patterns: { + totalToolCalls: 50, + uniqueToolsCount: 50, + toolsInLoops: [], + maxLoopNesting: 0, + toolSequence: toolCalls.map((tc) => tc.toolName), + iteratesOverToolResults: false, + }, + signals: { + maxLimit: 100, + maxStringLength: 50, + toolCallDensity: 1.0, + fanOutRisk: 40, + }, + sensitive: { + fieldsAccessed: [], + categories: [], + }, + meta: { + extractionTimeMs: 1, + codeHash: 'long-sequence-hash', + lineCount: 50, + }, + }; + + const result = await scorer.score(features); + + expect(result.scorerType).toBe('local-llm'); + expect(typeof result.totalScore).toBe('number'); + + scorer.dispose(); + }); + + it('should handle features with all risk signals present in similarity mode', async () => { + const config: LocalLlmConfig = { + modelId: 'Xenova/all-MiniLM-L6-v2', + mode: 'similarity', + fallbackToRules: true, + }; + + const scorer = new LocalLlmScorer(config); + await scorer.initialize(); + + // Create features with multiple risk indicators + const features: ExtractedFeatures = { + toolCalls: [ + { + toolName: 'users:list', + isStaticName: true, + argumentKeys: ['password', 'token', 'secret'], + stringLiterals: ['password', 'apikey', 'credential'], + numericLiterals: [10000], + insideLoop: true, + loopDepth: 2, + location: { line: 1, column: 1 }, + }, + { + toolName: 'webhook:send', + isStaticName: true, + argumentKeys: ['url', 'data'], + stringLiterals: ['http://external.com'], + numericLiterals: [], + insideLoop: false, + loopDepth: 0, + location: { line: 5, column: 1 }, + }, + ], + patterns: { + totalToolCalls: 2, + uniqueToolsCount: 2, + toolsInLoops: ['users:list'], + maxLoopNesting: 2, + toolSequence: ['users:list', 'webhook:send'], + iteratesOverToolResults: true, + }, + signals: { + maxLimit: 10000, + maxStringLength: 100, + toolCallDensity: 0.5, + fanOutRisk: 80, + }, + sensitive: { + fieldsAccessed: ['password', 'token', 'ssn', 'creditCard'], + categories: ['authentication', 'pii', 'financial'], + }, + meta: { + extractionTimeMs: 1, + codeHash: 'high-risk-hash', + lineCount: 10, + }, + }; + + const result = await scorer.score(features); + + // Should have high score due to multiple risk factors + expect(result.totalScore).toBeGreaterThan(50); + expect(result.signals.length).toBeGreaterThan(0); + + scorer.dispose(); + }); + }); + }); }); diff --git a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts 
b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts
index e1add92..ae9a95f 100644
--- a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts
+++ b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts
@@ -19,6 +19,18 @@ import { join } from 'node:path';
 // Pipeline type from @huggingface/transformers
 type Pipeline = (input: string, options?: Record<string, unknown>) => Promise<{ data: number[] }>;
 
+// VectoriaDB types (optional dependency)
+interface VectoriaSearchResult {
+  id: string;
+  score: number;
+  metadata?: Record<string, unknown>;
+}
+
+interface VectoriaDBInstance {
+  initialize(): Promise<void>;
+  search(query: string, options?: { topK?: number; threshold?: number }): Promise<VectoriaSearchResult[]>;
+}
+
 /**
  * Default model cache directory
  */
@@ -87,6 +99,7 @@ export class LocalLlmScorer extends BaseScorer {
   private initPromise: Promise<void> | null = null;
   private readonly fallbackScorer: RuleBasedScorer | null;
   private readonly config: LocalLlmConfig;
+  private vectoriaDB: VectoriaDBInstance | null = null;
 
   constructor(config: LocalLlmConfig) {
     super();
@@ -145,6 +158,11 @@ await this.config.customAnalyzer.initialize();
       }
 
+      // Initialize VectoriaDB for similarity mode
+      if (this.config.mode === 'similarity') {
+        await this.initializeVectoriaDB();
+      }
+
       this.ready = true;
     } catch (error) {
       this.initPromise = null;
@@ -161,6 +179,11 @@ await this.config.customAnalyzer.initialize();
         }
 
+        // Try to initialize VectoriaDB for similarity mode even if model fails
+        if (this.config.mode === 'similarity') {
+          await this.initializeVectoriaDB();
+        }
+
        this.ready = true; // Ready with fallback
       } else {
         throw new LocalLlmScorerError(
@@ -170,6 +193,33 @@ }
   }
 
+  /**
+   * Initialize VectoriaDB for similarity-based scoring
+   */
+  private async initializeVectoriaDB(): Promise<void> {
+    try {
+      // Dynamic import of VectoriaDB (optional dependency)
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const vectoriaModule = await (Function('return import("vectoriadb")')() as Promise<any>);
+      const { VectoriaDB } = vectoriaModule;
+
+      const modelName = this.config.vectoriaConfig?.modelName ?? this.config.modelId;
+
+      this.vectoriaDB = new VectoriaDB({
+        modelName,
+      }) as VectoriaDBInstance;
+
+      await this.vectoriaDB.initialize();
+    } catch (error) {
+      console.warn(
+        `[LocalLlmScorer] VectoriaDB initialization failed, similarity mode will use heuristics: ${
+          error instanceof Error ? error.message : String(error)
+        }`,
+      );
+      this.vectoriaDB = null;
+    }
+  }
+
   /**
    * Score the extracted features
    */
@@ -202,7 +252,8 @@ }
 
     // If model failed to load and we have fallback (no custom analyzer)
-    if (!this.pipeline && this.fallbackScorer) {
+    // Skip fallback for similarity mode - it can work without the pipeline using VectoriaDB/heuristics
+    if (!this.pipeline && this.fallbackScorer && this.config.mode !== 'similarity') {
       const result = await this.fallbackScorer.score(features);
       return {
         ...result,
@@ -254,21 +305,47 @@ /**
    * Score using similarity to known malicious patterns
+   *
+   * Uses VectoriaDB to find similar patterns in a pre-built index.
+   * Falls back to heuristic analysis if VectoriaDB is not available.
*/ private async scoreWithSimilarity(features: ExtractedFeatures, startTime: number): Promise { - // Convert features to text prompt const prompt = this.featuresToPrompt(features); + const signals: RiskSignal[] = []; + let score = 0; - // Score using custom analyzer or built-in heuristics - const { score, signals } = await this.analyzePrompt(prompt, features); + // Try VectoriaDB similarity search if available + if (this.vectoriaDB) { + const threshold = this.config.vectoriaConfig?.threshold ?? 0.85; + const topK = this.config.vectoriaConfig?.topK ?? 5; - // Add similarity mode signal - signals.push({ - id: 'SIMILARITY_MODE', - score: 0, - description: 'Similarity scoring (VectoriaDB integration pending)', - level: 'none' as RiskLevel, - }); + try { + const results = await this.vectoriaDB.search(prompt, { topK, threshold }); + + if (results.length > 0) { + // Calculate score based on similarity matches + const maxSimilarity = Math.max(...results.map((r) => r.score)); + score = Math.floor(maxSimilarity * 100); + + signals.push({ + id: 'SIMILARITY_MATCH', + score, + description: `Matched ${results.length} known malicious pattern(s)`, + level: this.calculateRiskLevel(score), + context: { matches: results.map((r) => ({ id: r.id, score: r.score })) }, + }); + } + } catch (error) { + console.warn( + `[LocalLlmScorer] VectoriaDB search failed: ${error instanceof Error ? error.message : String(error)}`, + ); + } + } + + // Also run heuristic analysis as supplementary + const heuristics = await this.analyzePrompt(prompt, features); + signals.push(...heuristics.signals); + score = Math.max(score, heuristics.score); return { totalScore: this.clampScore(score), @@ -465,12 +542,20 @@ export class LocalLlmScorer extends BaseScorer { return this.pipeline === null && this.fallbackScorer !== null && this.ready; } + /** + * Check if VectoriaDB is available for similarity scoring + */ + isVectoriaDBAvailable(): boolean { + return this.vectoriaDB !== null; + } + /** * Dispose of resources */ override dispose(): void { this.pipeline = null; this.initPromise = null; + this.vectoriaDB = null; this.fallbackScorer?.dispose?.(); this.config.customAnalyzer?.dispose?.(); super.dispose(); diff --git a/libs/enclave-vm/src/scoring/types.ts b/libs/enclave-vm/src/scoring/types.ts index a20e7a7..4751f83 100644 --- a/libs/enclave-vm/src/scoring/types.ts +++ b/libs/enclave-vm/src/scoring/types.ts @@ -342,18 +342,45 @@ export interface ScoringGateResult { /** * Configuration for VectoriaDB-based similarity scoring + * + * Requires the `vectoriadb` package to be installed: + * ```bash + * npm install vectoriadb + * ``` + * See: https://github.com/agentfront/vectoriadb + * + * @breaking v2.x removed the `indexPath` option. VectoriaDB v2.x handles persistence + * via storage adapters (MemoryStorageAdapter, FileStorageAdapter, RedisStorageAdapter) + * instead of loadIndex(). If you need to persist patterns, use VectoriaDB's + * saveToStorage() API directly. 
+ * + * @example + * ```typescript + * const config: VectoriaConfigForScoring = { + * threshold: 0.85, // Match threshold + * topK: 5, // Return top 5 matches + * modelName: 'Xenova/all-MiniLM-L6-v2', + * }; + * ``` */ export interface VectoriaConfigForScoring { - /** - * Path to pre-built index with malicious patterns - */ - indexPath?: string; - /** * Similarity threshold (0-1) for considering a match * @default 0.85 */ threshold?: number; + + /** + * Top K results to consider + * @default 5 + */ + topK?: number; + + /** + * Model for embeddings + * @default Uses the modelId from LocalLlmConfig or 'Xenova/all-MiniLM-L6-v2' + */ + modelName?: string; } /** @@ -430,6 +457,11 @@ export interface LocalLlmConfig { /** * Configuration for similarity mode (VectoriaDB) * Required when mode='similarity' + * + * Requires the `vectoriadb` package to be installed: + * ```bash + * npm install vectoriadb + * ``` */ vectoriaConfig?: VectoriaConfigForScoring; diff --git a/libs/vectoriadb/.spec.swcrc b/libs/vectoriadb/.spec.swcrc deleted file mode 100644 index f130e22..0000000 --- a/libs/vectoriadb/.spec.swcrc +++ /dev/null @@ -1,31 +0,0 @@ -{ - "jsc": { - "target": "es2017", - "parser": { - "syntax": "typescript", - "decorators": true, - "dynamicImport": true - }, - "transform": { - "decoratorMetadata": true, - "legacyDecorator": true - }, - "keepClassNames": true, - "externalHelpers": true, - "loose": true - }, - "module": { - "type": "commonjs", - "strict": true, - "strictMode": true, - "lazy": false, - "noInterop": false - }, - "sourceMaps": true, - "exclude": [ - "jest", - "./node_modules/", - "\\.spec\\.ts$", - "\\.test\\.ts$" - ] -} diff --git a/libs/vectoriadb/CHANGELOG.md b/libs/vectoriadb/CHANGELOG.md deleted file mode 100644 index 1b54610..0000000 --- a/libs/vectoriadb/CHANGELOG.md +++ /dev/null @@ -1,41 +0,0 @@ -# Changelog - -All notable changes to `vectoriadb` will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [Unreleased] - -## [2.0.2] - 2026-01-07 - -### Fixed - -- Regex vulnerability detection now trims analyzed patterns to 500 characters and evaluates them through bounded helper regexes to avoid the analyzer triggering ReDoS. -- Redis namespace sanitization bounds the namespace prior to regex processing and swaps in precompiled `SAFE_PATTERNS` to strip control characters and unsafe symbols deterministically. - -### Security - -- Hardened both the regex analyzer and namespace sanitizer so untrusted input can no longer leverage those code paths for ReDoS attacks. - -## [2.0.0] - 2025-12-12 - -### Added - -- Added EmbeddingService.setTransformersModule() and clearTransformersModule() to allow injecting custom transformer pipelines (primarily for testing). - -### Changed - -- Transformer embeddings now lazy-load @huggingface/transformers and emit a ConfigurationError with installation guidance when the package is not installed. -- @huggingface/transformers is now distributed as an optional peer dependency and must be added explicitly when using transformer embeddings. 
- -## [1.0.0] - 2025-11-30 - -### Added - -- Initial release -- In-memory vector database for semantic search -- Cosine similarity and Euclidean distance metrics -- Namespace support for multi-tenant use cases -- Configurable embedding dimensions -- Efficient nearest neighbor search diff --git a/libs/vectoriadb/README.md b/libs/vectoriadb/README.md deleted file mode 100644 index cac6f51..0000000 --- a/libs/vectoriadb/README.md +++ /dev/null @@ -1,999 +0,0 @@ -# VectoriaDB - -[![npm version](https://img.shields.io/npm/v/vectoriadb.svg)](https://www.npmjs.com/package/vectoriadb) -[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](LICENSE) -[![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/) - -> A lightweight, production-ready in-memory vector database for semantic search in JavaScript/TypeScript - -VectoriaDB is a fast, minimal-dependency vector database designed for in-memory semantic search. Powered by [transformers.js](https://github.com/xenova/transformers.js), it's perfect for applications that need to quickly search through documents, tools, or any text-based data using natural language queries. - -## Table of Contents - -- [Features](#features) -- [Installation](#installation) -- [Why VectoriaDB?](#why-vectoriadb) -- [Quick Start](#quick-start) -- [Core Concepts](#core-concepts) -- [API Reference](#api-reference) -- [Advanced Usage](#advanced-usage) -- [Error Handling](#error-handling) -- [Performance](#performance) -- [Use Cases](#use-cases) -- [Testing](#testing) -- [Comparison](#comparison-with-other-vector-databases) -- [Limitations](#limitations) -- [Roadmap](#roadmap) -- [Contributing](#contributing) -- [License](#license) - -## Features - -- **🚀 Fast**: In-memory storage with optimized HNSW indexing for O(log n) search -- **🪶 Lightweight**: Minimal dependencies, small footprint -- **🔍 Semantic Search**: Natural language queries using state-of-the-art embeddings -- **🎯 Type-Safe**: Full TypeScript support with generics -- **⚡ Batch Operations**: Efficient bulk insert and search -- **🔧 Flexible Filtering**: Custom metadata filtering with type safety -- **📊 Scalable**: HNSW index for 100k+ documents with sub-millisecond search -- **💾 Persistent**: File & Redis adapters for caching across restarts -- **🔄 Smart Updates**: Incremental updates without re-embedding (instant metadata updates) -- **🛡️ Production-Ready Error Handling**: Typed error classes with specific error codes -- **📦 Battle-Tested**: Used in production at FrontMCP - -## Installation - -```bash -npm install vectoriadb -# or -yarn add vectoriadb -# or -pnpm add vectoriadb -``` - -**Requirements:** - -- Node.js 18+ (for transformers.js compatibility) -- TypeScript 5.0+ (if using TypeScript) - -## Why VectoriaDB? 
- -**Use VectoriaDB when you need:** - -- 🎯 **Semantic search** without complex infrastructure (no external services required) -- ⚡ **Fast in-memory search** with HNSW indexing (handles 100k+ documents) -- 🔒 **Privacy-first** - all embeddings generated locally, no API calls -- 🚀 **Production-ready** vector search with minimal setup -- 📦 **Embedded search** in Node.js applications, CLIs, or desktop apps - -**Skip VectoriaDB if you need:** - -- 💾 Persistent storage (use Pinecone, Weaviate, or Qdrant) -- 🌐 Distributed architecture (use Weaviate or Milvus) -- 📊 Multi-million document scale (use specialized distributed vector DBs) - -## Quick Start - -```typescript -import { VectoriaDB } from 'vectoriadb'; - -// Create and initialize the database -const db = new VectoriaDB(); -await db.initialize(); - -// Add documents -await db.add('doc-1', 'How to create a user account', { - id: 'doc-1', - category: 'auth', - author: 'Alice', -}); - -await db.add('doc-2', 'Send email notifications to users', { - id: 'doc-2', - category: 'notifications', - author: 'Bob', -}); - -// Search -const results = await db.search('creating new accounts'); -console.log(results[0].metadata); // { id: 'doc-1', category: 'auth', ... } -console.log(results[0].score); // 0.87 -``` - -## Core Concepts - -### Documents - -Each document in VectoriaDB consists of: - -- **id**: Unique identifier -- **text**: The text content to search -- **metadata**: Custom metadata (type-safe with generics) - -### Embeddings - -VectoriaDB automatically generates embeddings (vector representations) of your documents using transformers.js. The default model is `Xenova/all-MiniLM-L6-v2` (22MB, 384 dimensions), which provides a great balance of size, speed, and accuracy. - -### Search - -Search uses cosine similarity to find the most semantically similar documents to your query.
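To make the scores concrete, here is the standard cosine-similarity computation in a few lines of TypeScript (a sketch of the math only, not VectoriaDB's internal implementation):

```typescript
// cos(a, b) = (a · b) / (|a| · |b|): 1 for identical direction, 0 for orthogonal vectors.
function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) throw new Error('Vectors must have the same dimensions');
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom; // treat zero vectors as similarity 0
}

cosineSimilarity([1, 0], [1, 0]); // 1 (identical vectors)
cosineSimilarity([1, 0], [0, 1]); // 0 (orthogonal vectors)
```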
- -## API Reference - -### Constructor - -```typescript -const db = new VectoriaDB(config?) -``` - -**Config Options:** - -```typescript -interface VectoriaConfig { - modelName?: string; // Default: 'Xenova/all-MiniLM-L6-v2' - dimensions?: number; // Auto-detected from model - defaultSimilarityThreshold?: number; // Default: 0.3 - defaultTopK?: number; // Default: 10 -} -``` - -### Methods - -#### `initialize(): Promise<void>` - -Initialize the embedding model. Must be called before using the database. - -```typescript -await db.initialize(); -``` - -#### `add(id: string, text: string, metadata: T): Promise<void>` - -Add a single document to the database. - -```typescript -await db.add('doc-1', 'Document content', { id: 'doc-1', category: 'tech' }); -``` - -#### `addMany(documents: Array<{id, text, metadata}>): Promise<void>` - -Add multiple documents in batch (more efficient). - -```typescript -await db.addMany([ - { id: 'doc-1', text: 'Content 1', metadata: { id: 'doc-1' } }, - { id: 'doc-2', text: 'Content 2', metadata: { id: 'doc-2' } }, -]); -``` - -#### `search(query: string, options?): Promise<SearchResult<T>[]>` - -Search for documents using semantic similarity. - -```typescript -const results = await db.search('machine learning', { - topK: 5, // Return top 5 results - threshold: 0.5, // Minimum similarity score - filter: (metadata) => metadata.category === 'tech', // Custom filter - includeVector: false, // Include vector in results -}); -``` - -#### `get(id: string): DocumentEmbedding<T> | undefined` - -Get a document by ID. - -```typescript -const doc = db.get('doc-1'); -``` - -#### `has(id: string): boolean` - -Check if a document exists. - -```typescript -if (db.has('doc-1')) { - // Document exists -} -``` - -#### `remove(id: string): boolean` - -Remove a document. - -```typescript -db.remove('doc-1'); -``` - -#### `removeMany(ids: string[]): number` - -Remove multiple documents. - -```typescript -const removed = db.removeMany(['doc-1', 'doc-2']); -``` - -#### `clear(): void` - -Remove all documents. - -```typescript -db.clear(); -``` - -#### `size(): number` - -Get the number of documents. - -```typescript -const count = db.size(); -``` - -#### `filter(filterFn): DocumentEmbedding<T>[]` - -Get documents by filter (without semantic search). - -```typescript -const techDocs = db.filter((metadata) => metadata.category === 'tech'); -``` - -#### `getStats(): VectoriaStats` - -Get database statistics. - -```typescript -const stats = db.getStats(); -console.log(stats.totalEmbeddings); -console.log(stats.estimatedMemoryBytes); -``` - -## Advanced Usage - -### Type-Safe Metadata - -Use TypeScript generics for type-safe metadata: - -```typescript -interface MyMetadata extends DocumentMetadata { - id: string; - category: 'tech' | 'business' | 'science'; - author: string; - tags: string[]; -} - -const db = new VectoriaDB<MyMetadata>(); - -await db.add('doc-1', 'Content', { - id: 'doc-1', - category: 'tech', // Type-checked! - author: 'Alice', - tags: ['ai', 'ml'], -}); - -const results = await db.search('query', { - filter: (metadata) => { - // metadata is fully typed! - return metadata.category === 'tech' && metadata.tags.includes('ai'); - }, -}); -``` - -### Custom Embedding Models - -Use any Hugging Face model compatible with transformers.js: - -```typescript -const db = new VectoriaDB({ - modelName: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // Multilingual support -}); -``` - -### Batch Operations - -For better performance with large datasets: - -```typescript -const documents = [ - { id: '1', text: 'Doc 1', metadata: { id: '1' } }, - { id: '2', text: 'Doc 2', metadata: { id: '2' } }, - // ... thousands more -]; - -// Much faster than calling add() in a loop -await db.addMany(documents); -```
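To see the difference on your own data, a quick timing harness (illustrative only: timings are hardware-dependent, and `db` is an initialized instance as in the examples above):

```typescript
const mkDocs = (prefix: string) =>
  Array.from({ length: 500 }, (_, i) => ({
    id: `${prefix}-${i}`,
    text: `Document number ${i}`,
    metadata: { id: `${prefix}-${i}` },
  }));

// One embedding pass per call:
console.time('add() in a loop');
for (const doc of mkDocs('loop')) {
  await db.add(doc.id, doc.text, doc.metadata);
}
console.timeEnd('add() in a loop');

// Embeddings generated in batches internally:
console.time('addMany() batched');
await db.addMany(mkDocs('batch'));
console.timeEnd('addMany() batched');
```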
- -### HNSW Index for Production Scale - -For production applications with large datasets (>10k documents), enable HNSW (Hierarchical Navigable Small World) indexing for faster approximate nearest neighbor search: - -```typescript -const db = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 16, // Max connections per node (higher = better recall, more memory) - M0: 32, // Max connections at layer 0 - efConstruction: 200, // Construction quality (higher = better quality, slower build) - efSearch: 50, // Search quality (higher = better recall, slower search) - }, -}); - -await db.initialize(); - -// Add documents - HNSW index is built automatically -await db.addMany(documents); - -// Search uses HNSW for O(log n) instead of O(n) complexity -const results = await db.search('query'); -``` - -**HNSW Benefits:** - -- **Speed**: O(log n) search vs O(n) brute-force -- **Scalability**: Handles 100k+ documents efficiently -- **Accuracy**: >95% recall with proper tuning -- **Production-Ready**: Battle-tested algorithm used by major vector databases - -**Parameter Tuning:** - -| Parameter | Lower Value | Higher Value | Default | -| -------------- | --------------------------- | ---------------------------- | ------- | -| M | Faster build, less memory | Better recall, more memory | 16 | -| efConstruction | Faster build, lower quality | Better quality, slower build | 200 | -| efSearch | Faster search, lower recall | Better recall, slower search | 50 | - -**When to use HNSW:** - -- ✅ Dataset > 10,000 documents -- ✅ Search latency is critical -- ✅ Have memory for the graph structure (~50-100 bytes per document per connection) -- ❌ Dataset < 1,000 documents (overhead not worth it) -- ❌ Need exact nearest neighbors (HNSW is approximate) - -### Complex Filtering - -Combine semantic search with complex metadata filters: - -```typescript -interface SecurityMetadata extends DocumentMetadata { - id: string; - category: string; - tags: string[]; - author: string; - priority: 'low' | 'medium' | 'high'; -} - -const db = new VectoriaDB<SecurityMetadata>(); - -const results = await db.search('user authentication', { - topK: 10, - threshold: 0.4, - filter: (metadata) => { - return ( - metadata.category === 'security' && - metadata.tags.includes('auth') && - metadata.author === 'security-team' && - metadata.priority === 'high' - ); - }, -}); -``` - -### Persistence with Storage Adapters - -Cache embeddings across restarts to avoid recalculation.
VectoriaDB supports multiple storage backends: - -#### In-Memory (Default) - -No persistence - data is lost on restart: - -```typescript -const db = new VectoriaDB(); // Uses MemoryStorageAdapter by default -``` - -#### File-Based Persistence - -Perfect for local development - caches to disk with automatic invalidation when tools change: - -```typescript -import { VectoriaDB, FileStorageAdapter, SerializationUtils } from 'vectoriadb'; - -const documents = [ - { id: 'tool-1', text: 'Create user account', metadata: { id: 'tool-1' } }, - { id: 'tool-2', text: 'Send email notification', metadata: { id: 'tool-2' } }, -]; - -// Create tools hash for cache invalidation -const toolsHash = SerializationUtils.createToolsHash(documents); - -const db = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: './.cache/vectoriadb', - namespace: 'my-app', // Separate cache per namespace - }), - toolsHash, // Cache invalidated when tools change - version: '1.0.0', // Cache invalidated when version changes -}); - -await db.initialize(); // Automatically loads from cache if valid - -// Add documents (only on first run or after invalidation) -if (db.size() === 0) { - await db.addMany(documents); - await db.saveToStorage(); // Manually save to cache -} - -// Subsequent runs will load from cache instantly -``` - -#### Redis for Distributed Caching - -Share embeddings across pods in distributed environments: - -```typescript -import { VectoriaDB, RedisStorageAdapter, SerializationUtils } from 'vectoriadb'; -import Redis from 'ioredis'; // or your Redis client - -const documents = [ - /* your documents */ -]; -const toolsHash = SerializationUtils.createToolsHash(documents); - -const redis = new Redis({ - host: 'localhost', - port: 6379, -}); - -const db = new VectoriaDB({ - storageAdapter: new RedisStorageAdapter({ - client: redis, - namespace: 'my-app-v1', // Namespace by app + version - ttl: 86400, // 24 hours (default) - }), - toolsHash, - version: process.env.APP_VERSION, -}); - -await db.initialize(); // Loads from Redis if cache is valid - -if (db.size() === 0) { - await db.addMany(documents); - await db.saveToStorage(); -} - -// Don't forget to close when shutting down -await db.close(); -``` - -**Cache Invalidation:** - -The cache is automatically invalidated when: - -- `toolsHash` changes (documents added/removed/modified) -- `version` changes (application version updated) -- `modelName` changes (different embedding model) - -**Best Practices:** - -- **Local dev**: Use `FileStorageAdapter` to speed up restarts -- **Production**: Use `RedisStorageAdapter` for multi-pod deployments -- **Tools hash**: Create from document IDs + texts for automatic invalidation -- **Namespace**: Use app name + version to prevent cache conflicts -- **Manual save**: Call `saveToStorage()` after adding documents - -### Incremental Updates (Production-Ready) - -Update documents efficiently without re-embedding when only metadata changes: - -#### Update Metadata Only (Instant) - -```typescript -// Update metadata without re-embedding (instant operation) -db.updateMetadata('doc-1', { - id: 'doc-1', - category: 'updated-category', - priority: 'high', - lastModified: new Date(), -}); -``` - -#### Smart Update (Auto-Detection) - -```typescript -// Only re-embeds if text actually changed -const reembedded = await db.update('doc-1', { - text: 'Updated content', // If different, will re-embed - metadata: { id: 'doc-1', category: 'updated' }, -}); - -console.log(reembedded); // true if re-embedded, false if text was same 
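// For comparison (illustrative): a metadata-only payload skips re-embedding
// entirely, matching the updateMany() behavior documented below.
const metadataOnly = await db.update('doc-1', {
  metadata: { id: 'doc-1', category: 'updated' },
});
console.log(metadataOnly); // false, text unchanged so the embedding is reused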
-``` - -#### Batch Updates (Efficient) - -```typescript -// Update many documents - only re-embeds those with text changes -const result = await db.updateMany([ - { - id: 'doc-1', - text: 'New content for doc 1', // Will re-embed - metadata: { id: 'doc-1', category: 'tech' }, - }, - { - id: 'doc-2', - metadata: { id: 'doc-2', category: 'food' }, // No text = no re-embedding - }, - { - id: 'doc-3', - text: 'Same text as before', // Smart detection = no re-embedding - metadata: { id: 'doc-3', category: 'science' }, - }, -]); - -console.log(`Updated ${result.updated} documents`); -console.log(`Re-embedded ${result.reembedded} documents`); // Only what changed -``` - -#### Force Re-Embedding - -```typescript -// Force re-embed even if text hasn't changed (e.g., new embedding model) -await db.update('doc-1', { text: 'same text' }, { forceReembed: true }); - -// Force re-embed all in batch -await db.updateMany(docs, { forceReembed: true }); -``` - -**Performance Benefits:** - -| Operation | Speed | Re-embedding | -| ---------------------- | ---------- | ----------------- | -| `updateMetadata()` | Instant | Never | -| `update()` (metadata) | Instant | No | -| `update()` (text) | ~100-200ms | Only if changed | -| `updateMany()` (mixed) | Batched | Only what changed | - -**Use Cases:** - -- **Metadata updates**: Change categories, tags, priorities instantly -- **Partial text updates**: Only re-embed documents that actually changed -- **Dynamic content**: Update frequently changing metadata without performance hit -- **Bulk operations**: Efficiently update thousands of documents - -## Error Handling - -VectoriaDB provides production-ready error handling with specific error types that can be caught and handled individually. - -### Error Classes - -All errors extend the base `VectoriaError` class with a `code` property for programmatic error handling: - -```typescript -import { - VectoriaError, // Base error class - VectoriaNotInitializedError, // DB not initialized - DocumentValidationError, // Invalid document data - DocumentNotFoundError, // Document doesn't exist - DocumentExistsError, // Document already exists - DuplicateDocumentError, // Duplicate in batch or existing - QueryValidationError, // Invalid search query/params - EmbeddingError, // Embedding generation failure - StorageError, // Storage operation failure - ConfigurationError, // Invalid configuration -} from 'vectoriadb'; -``` - -### Error Types - -#### VectoriaNotInitializedError - -Thrown when operations are attempted before calling `initialize()`: - -```typescript -const db = new VectoriaDB(); - -try { - await db.add('doc-1', 'text', { id: 'doc-1' }); -} catch (error) { - if (error instanceof VectoriaNotInitializedError) { - console.log(error.code); // 'NOT_INITIALIZED' - console.log(error.message); // 'VectoriaDB must be initialized before adding documents...' 
- await db.initialize(); // Fix: initialize first - } -} -``` - -#### DocumentValidationError - -Thrown when document data is invalid: - -```typescript -try { - // Empty text - await db.add('doc-1', '', { id: 'doc-1' }); -} catch (error) { - if (error instanceof DocumentValidationError) { - console.log(error.code); // 'DOCUMENT_VALIDATION_ERROR' - console.log(error.documentId); // 'doc-1' - } -} - -try { - // Metadata.id mismatch - await db.add('doc-1', 'text', { id: 'doc-2' }); -} catch (error) { - if (error instanceof DocumentValidationError) { - console.log(error.message); // 'Metadata id "doc-2" does not match document id "doc-1"' - } -} -``` - -#### DocumentNotFoundError - -Thrown when attempting to update a non-existent document: - -```typescript -try { - await db.update('nonexistent', { text: 'new' }); -} catch (error) { - if (error instanceof DocumentNotFoundError) { - console.log(error.code); // 'DOCUMENT_NOT_FOUND' - console.log(error.documentId); // 'nonexistent' - } -} -``` - -#### DocumentExistsError - -Thrown when adding a document with an ID that already exists: - -```typescript -await db.add('doc-1', 'text', { id: 'doc-1' }); - -try { - await db.add('doc-1', 'duplicate', { id: 'doc-1' }); -} catch (error) { - if (error instanceof DocumentExistsError) { - console.log(error.code); // 'DOCUMENT_EXISTS' - console.log(error.documentId); // 'doc-1' - // Fix: use remove() first or choose different ID - db.remove('doc-1'); - await db.add('doc-1', 'duplicate', { id: 'doc-1' }); - } -} -``` - -#### DuplicateDocumentError - -Thrown when batch operations contain duplicates: - -```typescript -try { - await db.addMany([ - { id: 'doc-1', text: 'first', metadata: { id: 'doc-1' } }, - { id: 'doc-1', text: 'second', metadata: { id: 'doc-1' } }, // Duplicate in batch - ]); -} catch (error) { - if (error instanceof DuplicateDocumentError) { - console.log(error.code); // 'DUPLICATE_DOCUMENT' - console.log(error.context); // 'batch' or 'existing' - console.log(error.documentId); // 'doc-1' - } -} -``` - -#### QueryValidationError - -Thrown when search parameters are invalid: - -```typescript -try { - await db.search(''); // Empty query -} catch (error) { - if (error instanceof QueryValidationError) { - console.log(error.code); // 'QUERY_VALIDATION_ERROR' - } -} - -try { - await db.search('query', { topK: -5 }); // Invalid topK -} catch (error) { - if (error instanceof QueryValidationError) { - console.log(error.message); // 'topK must be a positive number' - } -} - -try { - await db.search('query', { threshold: 1.5 }); // Invalid threshold -} catch (error) { - if (error instanceof QueryValidationError) { - console.log(error.message); // 'threshold must be between 0 and 1' - } -} -``` - -#### EmbeddingError - -Thrown when embedding generation fails: - -```typescript -try { - // This would only happen with internal errors - await db.addMany(documents); -} catch (error) { - if (error instanceof EmbeddingError) { - console.log(error.code); // 'EMBEDDING_ERROR' - console.log(error.details); // Additional error details - } -} -``` - -### Production Error Handling Patterns - -#### Catch Specific Errors - -```typescript -try { - await db.add('doc-1', text, metadata); -} catch (error) { - if (error instanceof DocumentExistsError) { - // Handle duplicate: maybe update instead - await db.update(error.documentId, { text, metadata }); - } else if (error instanceof DocumentValidationError) { - // Handle validation: log and skip - console.error(`Invalid document ${error.documentId}:`, error.message); - } else 
if (error instanceof VectoriaNotInitializedError) { - // Handle initialization: retry after init - await db.initialize(); - await db.add('doc-1', text, metadata); - } else { - // Unknown error: rethrow - throw error; - } -} -``` - -#### Catch by Error Code - -```typescript -try { - await db.search(query); -} catch (error) { - if (error instanceof VectoriaError) { - switch (error.code) { - case 'NOT_INITIALIZED': - await db.initialize(); - break; - case 'QUERY_VALIDATION_ERROR': - console.error('Invalid query:', error.message); - break; - default: - throw error; - } - } -} -``` - -#### Batch Operations with Error Recovery - -```typescript -async function addDocumentsSafely<T extends DocumentMetadata>(documents: Array<{ id: string; text: string; metadata: T }>) { - try { - await db.addMany(documents); - } catch (error) { - if (error instanceof DuplicateDocumentError) { - // Remove duplicate and retry - const uniqueDocs = documents.filter((doc) => doc.id !== error.documentId); - await db.addMany(uniqueDocs); - console.warn(`Skipped duplicate: ${error.documentId}`); - } else if (error instanceof DocumentValidationError) { - // Log validation error and continue with valid documents - console.error(`Invalid document ${error.documentId}:`, error.message); - // Filter out invalid document and retry - const validDocs = documents.filter((doc) => doc.id !== error.documentId); - await db.addMany(validDocs); - } else { - throw error; // Unexpected error - } - } -} -``` - -#### Graceful Degradation - -```typescript -async function searchWithFallback(query: string) { - try { - return await db.search(query); - } catch (error) { - if (error instanceof QueryValidationError) { - // Fallback to default search - console.warn('Invalid query, using default search'); - return await db.search('default query', { threshold: 0.1 }); - } else if (error instanceof VectoriaNotInitializedError) { - // Initialize and retry - await db.initialize(); - return await db.search(query); - } - throw error; - } -} -``` - -### Error Codes Reference - -| Error Class | Code | When Thrown | -| ----------------------------- | --------------------------- | --------------------------------------- | -| `VectoriaNotInitializedError` | `NOT_INITIALIZED` | Operation before `initialize()` | -| `DocumentValidationError` | `DOCUMENT_VALIDATION_ERROR` | Empty text, metadata mismatch | -| `DocumentNotFoundError` | `DOCUMENT_NOT_FOUND` | Update/get non-existent document | -| `DocumentExistsError` | `DOCUMENT_EXISTS` | Add document with existing ID | -| `DuplicateDocumentError` | `DUPLICATE_DOCUMENT` | Duplicate in batch or existing document | -| `QueryValidationError` | `QUERY_VALIDATION_ERROR` | Empty query, invalid topK/threshold | -| `EmbeddingError` | `EMBEDDING_ERROR` | Embedding generation failure | -| `StorageError` | `STORAGE_ERROR` | Storage operation failure | -| `ConfigurationError` | `CONFIGURATION_ERROR` | Invalid configuration | - -### Best Practices - -1. **Always catch specific errors** instead of generic `Error` -2. **Use error codes** for programmatic handling -3. **Access error properties** (`documentId`, `context`, etc.) for debugging -4. **Implement retry logic** for `VectoriaNotInitializedError` -5. **Log validation errors** with context for debugging -6. **Graceful fallbacks** for production resilience (combined in the sketch below)
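Put together, the practices above can be wrapped in one small helper. This is an illustrative sketch built only from the error classes and codes documented here (the `withVectoria` name and shape are not a library API):

```typescript
import { VectoriaDB, VectoriaError, VectoriaNotInitializedError } from 'vectoriadb';

// Illustrative: retry once on NOT_INITIALIZED, log other Vectoria errors by
// code for programmatic triage, and rethrow anything unexpected.
async function withVectoria<R>(db: VectoriaDB, operation: () => Promise<R>): Promise<R | undefined> {
  try {
    return await operation();
  } catch (error) {
    if (error instanceof VectoriaNotInitializedError) {
      await db.initialize();
      return await operation(); // retry once after initializing
    }
    if (error instanceof VectoriaError) {
      console.error(`[vectoria:${error.code}]`, error.message);
      return undefined; // graceful fallback instead of crashing
    }
    throw error; // unknown error: rethrow
  }
}

const results = await withVectoria(db, () => db.search('user authentication'));
```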
- -## Performance - -### Memory Usage - -Memory efficient with Float32 arrays: - -- **Embeddings**: ~1.5KB per document (384 dimensions × 4 bytes) -- **Metadata**: ~1KB per document (estimated) - -**Example**: 10,000 documents ≈ 25 MB - -### Search Speed - -**Without HNSW (brute-force):** - -- **Complexity**: O(n) where n = number of documents -- **Performance**: <10ms for 10,000 documents on modern hardware -- **Best for**: <10,000 documents - -**With HNSW (approximate nearest neighbor):** - -- **Complexity**: O(log n) approximate search -- **Performance**: Sub-millisecond for 100,000+ documents -- **Accuracy**: >95% recall with default parameters -- **Best for**: >10,000 documents - -### Embedding Generation - -- **Model**: Xenova/all-MiniLM-L6-v2 (22MB) -- **Speed**: ~100-200 embeddings/second (hardware dependent) -- **Batch optimization**: 32 documents per batch - -## Use Cases - -### 1. Tool Discovery - -```typescript -interface ToolMetadata extends DocumentMetadata { - id: string; - toolName: string; - category: string; -} - -const db = new VectoriaDB<ToolMetadata>(); -await db.initialize(); - -await db.addMany([ - { id: 'tool-1', text: 'Create user accounts', metadata: { id: 'tool-1', toolName: 'create_user', category: 'auth' } }, - { id: 'tool-2', text: 'Send emails', metadata: { id: 'tool-2', toolName: 'send_email', category: 'notification' } }, -]); - -const results = await db.search('how to add new users'); -// Returns: [{ metadata: { toolName: 'create_user', ... }, score: 0.89 }] -``` - -### 2. Documentation Search - -```typescript -interface DocMetadata extends DocumentMetadata { - id: string; - title: string; - section: string; - url: string; -} - -const db = new VectoriaDB<DocMetadata>(); -// Add documentation pages -// Search with natural language -``` - -### 3. Product Search - -```typescript -interface ProductMetadata extends DocumentMetadata { - id: string; - name: string; - category: string; - price: number; -} - -const db = new VectoriaDB<ProductMetadata>(); -// Add products with descriptions -// Search: "affordable wireless headphones" -``` - -## Testing - -VectoriaDB comes with comprehensive tests covering all major functionality: - -```bash -# Run tests -npm test - -# Run tests with coverage -npm run test:coverage -``` - -The test suite includes: - -- **Embedding Service Tests**: Verify embedding generation and model initialization -- **Vector Database Tests**: Test CRUD operations, search, and filtering -- **Similarity Tests**: Validate cosine similarity calculations - -All tests use mocked transformers.js to avoid downloading models during CI/CD, making tests fast and reliable. - -## Comparison with Other Vector Databases - -| Feature | VectoriaDB | Pinecone | Weaviate | ChromaDB | -| -------------------- | ---------- | -------- | -------- | -------- | -| **In-memory** | ✅ | ❌ | ❌ | ✅ | -| **Lightweight** | ✅ (22MB) | ❌ | ❌ | ⚠️ | -| **Type-safe** | ✅ | ⚠️ | ⚠️ | ⚠️ | -| **Zero config** | ✅ | ❌ | ❌ | ✅ | -| **Production-ready** | ✅ | ✅ | ✅ | ✅ | -| **Persistence** | ⚠️ (via adapters) | ✅ | ✅ | ✅ | -| **Distributed** | ❌ | ✅ | ✅ | ❌ | - -VectoriaDB is ideal for: - -- **Small to medium datasets** (<100k documents) -- **Fast in-memory search** without external dependencies -- **Embedded applications** that need semantic search -- **Development and testing** before scaling to production DBs - -## Limitations - -1. **Single process**: Not distributed (use Redis adapter for multi-pod setups) -2. 
**HNSW is approximate**: ~95% recall vs 100% with brute-force (use brute-force for exact results) -3. **In-memory primary**: Persistence via adapters (cache strategy, not database) - -## Roadmap - -- [x] Comprehensive test suite with mocked dependencies -- [x] HNSW indexing for faster search (>100k documents) -- [x] Persistence adapters (Redis, File, Memory) -- [x] Incremental updates without re-embedding -- [x] Production-ready error handling with typed exceptions -- [ ] Compression for stored embeddings -- [ ] Multi-vector embeddings per document - -## Contributing - -Contributions are welcome! Please see [CONTRIBUTING.md](../../CONTRIBUTING.md) for details. - -## License - -Apache-2.0 - -## Credits - -Built with: - -- [transformers.js](https://github.com/xenova/transformers.js) by Xenova -- Part of the [FrontMCP](https://github.com/agentfront/frontmcp) ecosystem diff --git a/libs/vectoriadb/TESTING.md b/libs/vectoriadb/TESTING.md deleted file mode 100644 index c3aad56..0000000 --- a/libs/vectoriadb/TESTING.md +++ /dev/null @@ -1,331 +0,0 @@ -# VectoriaDB Testing Guide - -## Test Suite Overview - -VectoriaDB includes comprehensive tests covering all functionality with **100% API coverage**. - -## Quick Start - -```bash -# Run all tests -nx test vectoriadb - -# Run with coverage report -nx test vectoriadb --coverage - -# Run in watch mode (for development) -nx test vectoriadb --watch - -# Run specific test file -nx test vectoriadb --testFile=similarity.test.ts -``` - -## Test Files - -### 1. `similarity.test.ts` (Fast - No Model Required) - -Tests vector similarity utilities: - -- ✅ 15 tests covering cosine similarity, normalization, euclidean distance, dot product -- ⚡ **Fast**: < 1 second -- 🎯 **Zero dependencies**: Pure math calculations - -**Run individually:** - -```bash -nx test vectoriadb --testFile=similarity.test.ts -``` - -### 2. `embedding.test.ts` (Slow - Downloads Model First Time) - -Tests embedding generation service: - -- ✅ Tests for model initialization, single/batch embeddings, custom models -- ⏱️ **First run**: ~30-60 seconds (downloads 22MB model) -- ⚡ **Subsequent runs**: ~5-10 seconds (uses cached model) - -**Important**: First run downloads the model to `.cache/transformers/` - -### 3. `vectoria.test.ts` (Slow - Requires Model) - -Tests main VectoriaDB functionality: - -- ✅ Comprehensive tests for all CRUD operations, search, filtering -- ⏱️ **Runtime**: ~10-20 seconds -- 🔍 **Coverage**: All public API methods, edge cases, error handling - -## Test Results - -```text -PASS vectoriadb similarity.test.ts - Similarity Utils - cosineSimilarity - ✓ should return 1 for identical vectors - ✓ should return 0 for orthogonal vectors - ✓ should return -1 for opposite vectors - ✓ should handle similar but not identical vectors - ✓ should throw error for vectors of different dimensions - ✓ should return 0 for zero vectors - normalizeVector - ✓ should normalize a vector to unit length - ✓ should handle already normalized vector - ✓ should handle zero vector - euclideanDistance - ✓ should calculate distance between identical vectors as 0 - ✓ should calculate distance correctly - ✓ should throw error for vectors of different dimensions - dotProduct - ✓ should calculate dot product correctly - ✓ should return 0 for orthogonal vectors - ✓ should throw error for vectors of different dimensions - -Test Suites: 1 passed, 1 total -Tests: 15 passed, 15 total -``` - -## First-Time Setup - -The first test run will: - -1. Download the embedding model (~22MB) -2. 
Cache it in `./.cache/transformers/` -3. Take ~30-60 seconds - -**Subsequent runs use the cached model and are much faster.** - -## CI/CD Integration - -### GitHub Actions Example - -```yaml -name: Test VectoriaDB - -on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Setup Node.js - uses: actions/setup-node@v3 - with: - node-version: '24' - - # Cache the transformer models - - name: Cache Transformers Models - uses: actions/cache@v3 - with: - path: .cache/transformers - key: transformers-${{ runner.os }}-${{ hashFiles('libs/vectoriadb/package.json') }} - restore-keys: | - transformers-${{ runner.os }}- - - - name: Install dependencies - run: yarn install - - - name: Run tests - run: nx test vectoriadb --coverage - - - name: Upload coverage - uses: codecov/codecov-action@v3 - with: - files: ./coverage/libs/vectoriadb/lcov.info -``` - -### Pre-download Model (Optional) - -For faster CI builds, pre-download the model: - -```bash -# Add to your CI setup script -node -e " - import('@huggingface/transformers').then(async ({ pipeline }) => { - await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); - console.log('Model cached successfully'); - }); -" -``` - -## Coverage - -Run with coverage report: - -```bash -nx test vectoriadb --coverage -``` - -Expected coverage: - -- **Statements**: > 95% -- **Branches**: > 90% -- **Functions**: > 95% -- **Lines**: > 95% - -Coverage report is generated in `coverage/libs/vectoriadb/` - -## Debugging Tests - -### Verbose output - -```bash -nx test vectoriadb --verbose -``` - -### Run specific test by name - -```bash -nx test vectoriadb -t "should add a document" -``` - -### Debug mode - -```bash -node --inspect-brk node_modules/.bin/nx test vectoriadb -``` - -Then open `chrome://inspect` in Chrome and click "inspect". - -## Writing New Tests - -### Template - -```typescript -import { VectoriaDB } from '../vectoria'; -import { DocumentMetadata } from '../interfaces'; - -interface TestMetadata extends DocumentMetadata { - id: string; - category: string; -} - -describe('MyFeature', () => { - let db: VectoriaDB<TestMetadata>; - - beforeAll(async () => { - db = new VectoriaDB<TestMetadata>(); - await db.initialize(); - }, 60000); // Timeout for model loading - - afterEach(() => { - db.clear(); // Clean up between tests - }); - - test('should do something', async () => { - await db.add('doc-1', 'content', { - id: 'doc-1', - category: 'test', - }); - - const results = await db.search('query'); - expect(results.length).toBeGreaterThan(0); - }); -}); -``` - -### Best Practices - -1. **Use `beforeAll` for initialization** - Don't re-initialize the model for each test -2. **Use `afterEach` to clear data** - Keep tests independent -3. **Set appropriate timeouts** - Model loading tests need 60s timeout (see the sketch below) -4. **Type your metadata** - Use TypeScript generics for type safety -5. **Test edge cases** - Empty inputs, non-existent IDs, etc.
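For practice 3, a minimal sketch of per-hook and per-test timeout overrides (reusing `db` and `TestMetadata` from the template above; the millisecond values mirror the ones used in this repo's specs):

```typescript
// Jest accepts a timeout in milliseconds as the last argument to hooks and tests;
// the ~5s default is far too short for a first-time model download.
beforeAll(async () => {
  db = new VectoriaDB<TestMetadata>();
  await db.initialize(); // may download the model on first run
}, 60000);

test('large batch insert', async () => {
  const docs = Array.from({ length: 100 }, (_, i) => ({
    id: `doc-${i}`,
    text: `text ${i}`,
    metadata: { id: `doc-${i}`, category: 'test' },
  }));
  await db.addMany(docs);
  expect(db.size()).toBe(100);
}, 120000); // extra headroom, as in the large-batch test in embedding.spec.ts
```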
- -## Troubleshooting - -### Model Download Fails - -**Problem**: Model download times out or fails - -**Solution**: - -```bash -# Delete cache and retry -rm -rf .cache/transformers -nx test vectoriadb -``` - -### Out of Memory - -**Problem**: Tests fail with "JavaScript heap out of memory" - -**Solution**: - -```bash -# Increase Node.js memory -NODE_OPTIONS=--max-old-space-size=4096 nx test vectoriadb -``` - -### Tests Hang - -**Problem**: Tests hang during initialization - -**Solution**: - -1. Check your internet connection (model download required) -2. Increase timeout in `jest.config.ts` -3. Check firewall settings (Hugging Face CDN access needed) - -### Import Errors - -**Problem**: `Cannot find module '@huggingface/transformers'` - -**Solution**: - -```bash -cd libs/vectoriadb -yarn install -``` - -## Performance Benchmarks - -| Test Suite | Tests | Time (Cached) | Time (First Run) | -| ------------------ | ------- | ------------- | ---------------- | -| similarity.test.ts | 15 | < 1s | < 1s | -| embedding.test.ts | ~10 | 5-10s | 30-60s | -| vectoria.test.ts | ~40 | 10-20s | 30-60s | -| **Total** | **~65** | **15-30s** | **60-120s** | - -## Continuous Testing - -For development with live reload: - -```bash -nx test vectoriadb --watch -``` - -This will: - -- ✅ Re-run tests on file changes -- ✅ Show only failed tests after first run -- ✅ Provide interactive mode for focused testing - -Press `p` to filter by test file name -Press `t` to filter by test name -Press `q` to quit - -## Test Configuration - -Tests are configured in: - -- `jest.config.ts` - Jest configuration -- `.spec.swcrc` - SWC transpiler settings -- `project.json` - NX test target - -## Next Steps - -After tests pass: - -1. Build the library: `nx build vectoriadb` -2. Run integration tests (if any) -3. Check coverage report -4. Update documentation if needed - -## Support - -For test-related issues: - -- Check the [test README](src/__tests__/README.md) -- Open an issue on GitHub -- Review [Jest documentation](https://jestjs.io/) diff --git a/libs/vectoriadb/eslint.config.mjs b/libs/vectoriadb/eslint.config.mjs deleted file mode 100644 index 5348377..0000000 --- a/libs/vectoriadb/eslint.config.mjs +++ /dev/null @@ -1,38 +0,0 @@ -import baseConfig from '../../eslint.config.mjs'; - -export default [ - ...baseConfig, - { - files: ['**/*.ts', '**/*.tsx'], - rules: { - '@typescript-eslint/no-explicit-any': 'off', - '@typescript-eslint/no-non-null-assertion': 'off', - '@typescript-eslint/no-unused-vars': [ - 'warn', - { - argsIgnorePattern: '^_', - varsIgnorePattern: '^_', - caughtErrorsIgnorePattern: '^_', - }, - ], - }, - }, - { - files: ['**/*.json'], - rules: { - '@nx/dependency-checks': [ - 'error', - { - ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], - ignoredDependencies: ['@huggingface/transformers'], // Optional peer dependency loaded dynamically - }, - ], - }, - languageOptions: { - parser: await import('jsonc-eslint-parser'), - }, - }, - { - ignores: ['**/out-tsc', '**/dist', '**/coverage', '**/__tests__', '**/.nx', '**/.idea', '**/.git'], - }, -]; diff --git a/libs/vectoriadb/jest.config.ts b/libs/vectoriadb/jest.config.ts deleted file mode 100644 index f43761f..0000000 --- a/libs/vectoriadb/jest.config.ts +++ /dev/null @@ -1,25 +0,0 @@ -module.exports = { - displayName: 'vectoriadb', - preset: '../../jest.preset.js', - testEnvironment: 'node', - setupFilesAfterEnv: ['<rootDir>/jest.setup.ts'], - transform: { - '^.+\\.[tj]s$': [ - '@swc/jest', - { - jsc: { - parser: { syntax: 'typescript' }, - target: 'es2022', - }, - }, - ], - }, - moduleFileExtensions: ['ts', 'js', 'html'], - coverageDirectory: '../../coverage/libs/vectoriadb', - coveragePathIgnorePatterns: [ - '/node_modules/', - '/src/index.ts', // Ignore index.ts (re-exports only, no logic to test) - ], - testMatch: ['**/__tests__/**/*.spec.ts'], - testTimeout: 60000, -}; diff --git a/libs/vectoriadb/jest.setup.ts b/libs/vectoriadb/jest.setup.ts deleted file mode 100644 index 72e44d6..0000000 --- a/libs/vectoriadb/jest.setup.ts +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Jest setup file for vectoria tests - * Injects a mock transformers 
module to avoid ONNX Runtime issues in test environment - */ - -import { EmbeddingService } from './src/embedding.service'; - -// Helper to extract and normalize words from text -const extractWords = (text: string): string[] => { - return text - .toLowerCase() - .replace(/[^\w\s]/g, '') - .split(/\s+/) - .filter((word) => word.length > 2); // Filter out short words like "a", "an", "in" -}; - -// Normalize word forms to their root (simple stemming) -const normalizeWord = (word: string): string => { - // Simple stemming - remove common suffixes - return word.replace(/ing$/, '').replace(/s$/, '').replace(/ed$/, '').replace(/er$/, ''); -}; - -// Create a mock pipeline function that returns consistent embeddings -const createMockPipeline = () => { - return async (text: string | string[]) => { - const textStr = text.toString(); - const words = extractWords(textStr); - const normalizedWords = words.map(normalizeWord); - - // Create a 384-dimensional embedding (matching all-MiniLM-L6-v2) - const embedding = new Float32Array(384); - - // Each normalized word contributes to specific dimensions - // This ensures that texts with overlapping words have high similarity - normalizedWords.forEach((word) => { - // Calculate which dimensions this word affects - const wordHash = word.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); - - // Each word contributes to ~50 dimensions centered around its hash position - for (let offset = -25; offset < 25; offset++) { - const dim = (wordHash + offset) % 384; - // Use Gaussian-like contribution - const contribution = Math.exp(-(offset * offset) / 100); - embedding[dim] += contribution; - } - }); - - // Add small random-like component for uniqueness - for (let i = 0; i < 384; i++) { - // Deterministic "noise" based on text length and position - const noise = Math.sin(textStr.length * 100 + i) * 0.01; - embedding[i] += noise; - } - - // Normalize the embedding to unit length (as transformers.js does) - let norm = 0; - for (let i = 0; i < embedding.length; i++) { - norm += embedding[i] * embedding[i]; - } - norm = Math.sqrt(norm); - - if (norm > 0) { - for (let i = 0; i < embedding.length; i++) { - embedding[i] /= norm; - } - } - - return { - data: embedding, - }; - }; -}; - -// Create mock transformers module -const mockTransformersModule = { - pipeline: jest.fn(async () => { - return createMockPipeline(); - }), -}; - -// Inject mock transformers module before all tests -beforeAll(() => { - EmbeddingService.setTransformersModule(mockTransformersModule); -}); - -// Clear the mock after all tests -afterAll(() => { - EmbeddingService.clearTransformersModule(); -}); diff --git a/libs/vectoriadb/package.json b/libs/vectoriadb/package.json deleted file mode 100644 index 81d03ff..0000000 --- a/libs/vectoriadb/package.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "name": "vectoriadb", - "version": "2.0.2", - "description": "VectoriaDB - A lightweight, production-ready in-memory vector database for semantic search", - "author": "AgentFront ", - "homepage": "https://github.com/agentfront/frontmcp/tree/main/libs/vectoriadb", - "license": "Apache-2.0", - "keywords": [ - "vector-database", - "semantic-search", - "embeddings", - "in-memory", - "cosine-similarity", - "transformers", - "machine-learning", - "vector-search", - "similarity-search", - "vectordb", - "vectoria" - ], - "repository": { - "type": "git", - "url": "git+https://github.com/agentfront/enclave.git", - "directory": "libs/vectoriadb" - }, - "bugs": { - "url": "https://github.com/agentfront/enclave/issues" - 
}, - "main": "./dist/src/index.js", - "types": "./dist/src/index.d.ts", - "exports": { - "./package.json": "./package.json", - ".": { - "development": "./src/index.ts", - "types": "./dist/src/index.d.ts", - "import": "./dist/src/index.js", - "default": "./dist/src/index.js" - } - }, - "peerDependencies": { - "@huggingface/transformers": "^3.2.2" - }, - "peerDependenciesMeta": { - "@huggingface/transformers": { - "optional": true - } - }, - "devDependencies": { - "typescript": "^5.9.3" - } -} diff --git a/libs/vectoriadb/project.json b/libs/vectoriadb/project.json deleted file mode 100644 index 3b1c37f..0000000 --- a/libs/vectoriadb/project.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "vectoriadb", - "$schema": "../../node_modules/nx/schemas/project-schema.json", - "sourceRoot": "libs/vectoriadb/src", - "projectType": "library", - "tags": ["scope:libs", "scope:publishable", "versioning:independent"], - "targets": { - "build-tsc": { - "executor": "@nx/js:tsc", - "outputs": ["{options.outputPath}"], - "options": { - "outputPath": "libs/vectoriadb/dist", - "main": "libs/vectoriadb/src/index.ts", - "tsConfig": "libs/vectoriadb/tsconfig.lib.json", - "assets": ["libs/vectoriadb/README.md", "libs/vectoriadb/CHANGELOG.md", "LICENSE"] - } - }, - "build": { - "executor": "nx:run-commands", - "dependsOn": ["build-tsc"], - "options": { - "command": "node scripts/strip-dist-from-pkg.js libs/vectoriadb/dist/package.json" - } - }, - "nx-release-publish": { - "executor": "@nx/js:release-publish", - "dependsOn": ["build"], - "options": { - "packageRoot": "libs/vectoriadb/dist" - } - } - } -} diff --git a/libs/vectoriadb/src/__tests__/embedding.spec.ts b/libs/vectoriadb/src/__tests__/embedding.spec.ts deleted file mode 100644 index 3e7bc77..0000000 --- a/libs/vectoriadb/src/__tests__/embedding.spec.ts +++ /dev/null @@ -1,327 +0,0 @@ -import * as fs from 'fs/promises'; -import { EmbeddingService, EmbeddingError } from '../index'; - -describe('EmbeddingService', () => { - let embeddingService: EmbeddingService; - - beforeAll(async () => { - embeddingService = new EmbeddingService(); - await embeddingService.initialize(); - }, 60000); // Allow time for model download - - describe('initialization', () => { - test('should initialize successfully', () => { - expect(embeddingService.isReady()).toBe(true); - }); - - test('should detect correct dimensions', () => { - const dimensions = embeddingService.getDimensions(); - expect(dimensions).toBe(384); // all-MiniLM-L6-v2 has 384 dimensions - }); - - test('should return correct model name', () => { - const modelName = embeddingService.getModelName(); - expect(modelName).toBe('Xenova/all-MiniLM-L6-v2'); - }); - - test('should allow re-initialization without error', async () => { - await expect(embeddingService.initialize()).resolves.not.toThrow(); - }); - }); - - describe('generateEmbedding', () => { - test('should generate embedding for a single text', async () => { - const embedding = await embeddingService.generateEmbedding('test text'); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should generate different embeddings for different texts', async () => { - const embedding1 = await embeddingService.generateEmbedding('hello world'); - const embedding2 = await embeddingService.generateEmbedding('goodbye world'); - - expect(embedding1).not.toEqual(embedding2); - - // Check that they are actually different - let hasDifference = false; - for (let i = 0; i < embedding1.length; i++) { - if (embedding1[i] !== 
embedding2[i]) { - hasDifference = true; - break; - } - } - expect(hasDifference).toBe(true); - }); - - test('should generate similar embeddings for similar texts', async () => { - const embedding1 = await embeddingService.generateEmbedding('create user account'); - const embedding2 = await embeddingService.generateEmbedding('creating user accounts'); - - // Calculate cosine similarity - let dotProduct = 0; - let norm1 = 0; - let norm2 = 0; - - for (let i = 0; i < embedding1.length; i++) { - dotProduct += embedding1[i] * embedding2[i]; - norm1 += embedding1[i] * embedding1[i]; - norm2 += embedding2[i] * embedding2[i]; - } - - const similarity = dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2)); - expect(similarity).toBeGreaterThan(0.7); // Should be very similar - }); - - test('should handle empty string', async () => { - const embedding = await embeddingService.generateEmbedding(''); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle long text', async () => { - const longText = 'This is a very long text. '.repeat(100); - const embedding = await embeddingService.generateEmbedding(longText); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - }); - - describe('generateEmbeddings (batch)', () => { - test('should generate embeddings for multiple texts', async () => { - const texts = ['text 1', 'text 2', 'text 3']; - const embeddings = await embeddingService.generateEmbeddings(texts); - - expect(embeddings.length).toBe(3); - embeddings.forEach((embedding) => { - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - }); - - test('should handle empty array', async () => { - const embeddings = await embeddingService.generateEmbeddings([]); - expect(embeddings.length).toBe(0); - }); - - test('should handle large batches', async () => { - const texts = Array.from({ length: 100 }, (_, i) => `text ${i}`); - const embeddings = await embeddingService.generateEmbeddings(texts); - - expect(embeddings.length).toBe(100); - embeddings.forEach((embedding) => { - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - }, 120000); // Allow extra time for large batch - }); - - describe('custom model', () => { - test('should allow custom model name', async () => { - const customService = new EmbeddingService('Xenova/all-MiniLM-L6-v2'); - await customService.initialize(); - - expect(customService.getModelName()).toBe('Xenova/all-MiniLM-L6-v2'); - expect(customService.isReady()).toBe(true); - }, 60000); - }); - - describe('error handling and edge cases', () => { - test('should handle whitespace-only string', async () => { - const embedding = await embeddingService.generateEmbedding(' '); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle very long text', async () => { - const longText = 'word '.repeat(1000); // 5000 characters - const embedding = await embeddingService.generateEmbedding(longText); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }, 60000); - - test('should handle special characters', async () => { - const specialText = '!@#$%^&*()_+-={}[]|\\:";\'<>?,./`~'; - const embedding = await embeddingService.generateEmbedding(specialText); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle unicode characters', async () => { - const 
unicodeText = '你好世界 🌍 مرحبا العالم'; - const embedding = await embeddingService.generateEmbedding(unicodeText); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle emoji text', async () => { - const emojiText = '😀 😃 😄 😁 🎉 🎊 🎈'; - const embedding = await embeddingService.generateEmbedding(emojiText); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle empty batch', async () => { - const embeddings = await embeddingService.generateEmbeddings([]); - expect(embeddings).toEqual([]); - }); - - test('should handle batch with one item', async () => { - const embeddings = await embeddingService.generateEmbeddings(['single text']); - - expect(embeddings.length).toBe(1); - expect(embeddings[0]).toBeInstanceOf(Float32Array); - expect(embeddings[0].length).toBe(384); - }); - - test('should handle batch with mixed empty and non-empty strings', async () => { - const embeddings = await embeddingService.generateEmbeddings(['text', '', 'more text', ' ']); - - expect(embeddings.length).toBe(4); - embeddings.forEach((embedding) => { - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - }); - - test('should isReady return false before initialization', () => { - const service = new EmbeddingService(); - - expect(service.isReady()).toBe(false); - }); - - test('should getDimensions return correct value after initialization', async () => { - expect(embeddingService.getDimensions()).toBe(384); - }); - - test('should handle numbers as text', async () => { - const embedding = await embeddingService.generateEmbedding('123456789'); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - - test('should handle repeated initialization gracefully', async () => { - const service = new EmbeddingService(); - await service.initialize(); - await service.initialize(); // Second initialization - - expect(service.isReady()).toBe(true); - }, 60000); - - test('should use custom cache directory', async () => { - const customCacheDir = './tmp/custom-cache-test'; - const service = new EmbeddingService('Xenova/all-MiniLM-L6-v2', customCacheDir); - await service.initialize(); - - expect(service.isReady()).toBe(true); - - // Cleanup - try { - await fs.rm(customCacheDir, { recursive: true, force: true }); - } catch { - // Ignore cleanup errors - } - }, 60000); - - test('should handle concurrent initialization calls', async () => { - const service = new EmbeddingService(); - - // Call initialize() multiple times concurrently - const init1 = service.initialize(); - const init2 = service.initialize(); - const init3 = service.initialize(); - - // All should complete without error - await Promise.all([init1, init2, init3]); - - expect(service.isReady()).toBe(true); - }, 60000); - }); - - describe('error handling', () => { - test('should throw EmbeddingError on generation failure', async () => { - // Create a service and force it to be initialized with a broken pipeline - const service = new EmbeddingService(); - await service.initialize(); - - // Mock the internal pipeline to fail on next call - const originalPipeline = (service as any).pipeline; - (service as any).pipeline = jest.fn().mockRejectedValue(new Error('Pipeline execution failed')); - - await expect(service.generateEmbedding('test')).rejects.toThrow('Failed to generate embedding'); - await expect(service.generateEmbedding('test')).rejects.toThrow(EmbeddingError); - - 
// Restore - (service as any).pipeline = originalPipeline; - }, 60000); - - test('should throw EmbeddingError on batch generation failure', async () => { - const service = new EmbeddingService(); - await service.initialize(); - - // Mock the internal pipeline to fail - const originalPipeline = (service as any).pipeline; - (service as any).pipeline = jest.fn().mockRejectedValue(new Error('Batch processing failed')); - - await expect(service.generateEmbeddings(['test1', 'test2'])).rejects.toThrow('Failed to generate embeddings'); - await expect(service.generateEmbeddings(['test1', 'test2'])).rejects.toThrow(EmbeddingError); - - // Restore - (service as any).pipeline = originalPipeline; - }, 60000); - - test('should handle generateEmbedding called before initialization', async () => { - const service = new EmbeddingService(); - - // Call generateEmbedding without calling initialize first - // It should auto-initialize - const embedding = await service.generateEmbedding('auto-init test'); - - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - expect(service.isReady()).toBe(true); - }, 60000); - - test('should handle generateEmbeddings called before initialization', async () => { - const service = new EmbeddingService(); - - // Call generateEmbeddings without calling initialize first - // It should auto-initialize - const embeddings = await service.generateEmbeddings(['auto-init test 1', 'auto-init test 2']); - - expect(embeddings.length).toBe(2); - embeddings.forEach((embedding) => { - expect(embedding).toBeInstanceOf(Float32Array); - expect(embedding.length).toBe(384); - }); - expect(service.isReady()).toBe(true); - }, 60000); - - test('should include original error details in EmbeddingError', async () => { - const service = new EmbeddingService(); - await service.initialize(); - - const testError = new Error('Original pipeline error with details'); - (service as any).pipeline = jest.fn().mockRejectedValue(testError); - - try { - await service.generateEmbedding('test'); - fail('Should have thrown EmbeddingError'); - } catch (error) { - expect(error).toBeInstanceOf(EmbeddingError); - expect((error as EmbeddingError).message).toContain('Failed to generate embedding'); - expect((error as EmbeddingError).message).toContain('Original pipeline error with details'); - // Verify the original error is preserved - expect((error as EmbeddingError).details).toBe(testError); - } - }, 60000); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/error.utils.spec.ts b/libs/vectoriadb/src/__tests__/error.utils.spec.ts deleted file mode 100644 index c69da4b..0000000 --- a/libs/vectoriadb/src/__tests__/error.utils.spec.ts +++ /dev/null @@ -1,211 +0,0 @@ -/** - * Tests for error sanitization utilities - */ - -import { - sanitizeErrorMessage, - sanitizeDocumentId, - sanitizeFileError, - sanitizeStorageError, - GENERIC_ERROR_MESSAGES, -} from '../error.utils'; - -describe('Error Utilities', () => { - describe('sanitizeErrorMessage', () => { - it('should return error message in verbose mode with Error object', () => { - const error = new Error('Detailed error message'); - const result = sanitizeErrorMessage(error, 'Generic error', true); - - expect(result).toBe('Detailed error message'); - }); - - it('should return error message in verbose mode with string', () => { - const result = sanitizeErrorMessage('Detailed error message', 'Generic error', true); - - expect(result).toBe('Detailed error message'); - }); - - it('should return generic message in production mode with Error 
object', () => { - const error = new Error('Detailed error message with sensitive info'); - const result = sanitizeErrorMessage(error, 'Generic error', false); - - expect(result).toBe('Generic error'); - }); - - it('should return generic message in production mode with string', () => { - const result = sanitizeErrorMessage('Detailed error with /path/to/file', 'Generic error', false); - - expect(result).toBe('Generic error'); - }); - - it('should default to verbose mode when not specified', () => { - const error = new Error('Test error'); - const result = sanitizeErrorMessage(error, 'Generic error'); - - expect(result).toBe('Test error'); - }); - }); - - describe('sanitizeDocumentId', () => { - it('should return actual ID in verbose mode', () => { - const result = sanitizeDocumentId('doc-12345-secret', true); - - expect(result).toBe('doc-12345-secret'); - }); - - it('should return placeholder in production mode', () => { - const result = sanitizeDocumentId('doc-12345-secret', false); - - expect(result).toBe('[document]'); - }); - - it('should default to verbose mode when not specified', () => { - const result = sanitizeDocumentId('doc-12345-secret'); - - expect(result).toBe('doc-12345-secret'); - }); - }); - - describe('sanitizeFileError', () => { - it('should return detailed message in verbose mode with Error object', () => { - const error = new Error('ENOENT: file not found /path/to/file.json'); - const result = sanitizeFileError('read', error, true); - - expect(result).toBe('Failed to read file: ENOENT: file not found /path/to/file.json'); - }); - - it('should return detailed message in verbose mode with string', () => { - const result = sanitizeFileError('write', 'Permission denied /path/to/file.json', true); - - expect(result).toBe('Failed to write file: Permission denied /path/to/file.json'); - }); - - it('should return generic message in production mode with Error object', () => { - const error = new Error('ENOENT: file not found /path/to/file.json'); - const result = sanitizeFileError('read', error, false); - - expect(result).toBe('Failed to read file'); - }); - - it('should return generic message in production mode with string', () => { - const result = sanitizeFileError('write', 'Permission denied', false); - - expect(result).toBe('Failed to write file'); - }); - - it('should default to verbose mode when not specified', () => { - const error = new Error('Test error'); - const result = sanitizeFileError('delete', error); - - expect(result).toBe('Failed to delete file: Test error'); - }); - - it('should handle different operation names', () => { - const error = new Error('Test'); - - expect(sanitizeFileError('read', error, true)).toContain('read'); - expect(sanitizeFileError('write', error, true)).toContain('write'); - expect(sanitizeFileError('delete', error, true)).toContain('delete'); - expect(sanitizeFileError('move', error, true)).toContain('move'); - }); - }); - - describe('sanitizeStorageError', () => { - it('should return detailed message in verbose mode with Error object', () => { - const error = new Error('Connection refused to redis://localhost:6379'); - const result = sanitizeStorageError('connection', error, true); - - expect(result).toBe('Storage connection failed: Connection refused to redis://localhost:6379'); - }); - - it('should return detailed message in verbose mode with string', () => { - const result = sanitizeStorageError('save', 'Timeout connecting to database', true); - - expect(result).toBe('Storage save failed: Timeout connecting to database'); - }); - - 
it('should return generic message in production mode with Error object', () => { - const error = new Error('Connection refused to redis://localhost:6379'); - const result = sanitizeStorageError('connection', error, false); - - expect(result).toBe('Storage operation failed'); - }); - - it('should return generic message in production mode with string', () => { - const result = sanitizeStorageError('save', 'Timeout connecting', false); - - expect(result).toBe('Storage operation failed'); - }); - - it('should default to verbose mode when not specified', () => { - const error = new Error('Test error'); - const result = sanitizeStorageError('load', error); - - expect(result).toBe('Storage load failed: Test error'); - }); - - it('should handle different operation names', () => { - const error = new Error('Test'); - - expect(sanitizeStorageError('save', error, true)).toContain('save'); - expect(sanitizeStorageError('load', error, true)).toContain('load'); - expect(sanitizeStorageError('delete', error, true)).toContain('delete'); - expect(sanitizeStorageError('initialize', error, true)).toContain('initialize'); - }); - }); - - describe('GENERIC_ERROR_MESSAGES', () => { - it('should have all required generic error messages', () => { - expect(GENERIC_ERROR_MESSAGES.VALIDATION_ERROR).toBe('Validation failed'); - expect(GENERIC_ERROR_MESSAGES.DOCUMENT_NOT_FOUND).toBe('Document not found'); - expect(GENERIC_ERROR_MESSAGES.DOCUMENT_EXISTS).toBe('Document already exists'); - expect(GENERIC_ERROR_MESSAGES.DUPLICATE_DOCUMENT).toBe('Duplicate document detected'); - expect(GENERIC_ERROR_MESSAGES.QUERY_ERROR).toBe('Query validation failed'); - expect(GENERIC_ERROR_MESSAGES.EMBEDDING_ERROR).toBe('Embedding generation failed'); - expect(GENERIC_ERROR_MESSAGES.STORAGE_ERROR).toBe('Storage operation failed'); - expect(GENERIC_ERROR_MESSAGES.NOT_INITIALIZED).toBe('Database not initialized'); - expect(GENERIC_ERROR_MESSAGES.CONFIGURATION_ERROR).toBe('Invalid configuration'); - }); - - it('should be a constant object', () => { - // Verify it's read-only (TypeScript const assertion) - expect(Object.isFrozen(GENERIC_ERROR_MESSAGES)).toBe(false); - // But we can verify the keys exist - expect(Object.keys(GENERIC_ERROR_MESSAGES)).toHaveLength(9); - }); - - it('should have generic non-sensitive messages', () => { - // Verify none of the messages contain sensitive patterns - const messages = Object.values(GENERIC_ERROR_MESSAGES); - - messages.forEach((message) => { - expect(message).not.toMatch(/\//); // No paths - expect(message).not.toMatch(/\\/); // No paths - expect(message).not.toMatch(/\d{2,}/); // No IDs or numbers - expect(message).not.toMatch(/password|token|key|secret/i); // No sensitive keywords - }); - }); - }); - - describe('Production mode integration', () => { - it('should consistently hide sensitive information across all utilities', () => { - const sensitiveError = new Error('Failed to connect to redis://user:pass@localhost:6379/db'); - const sensitiveId = 'user-12345-email@example.com'; - - expect(sanitizeErrorMessage(sensitiveError, 'Error', false)).toBe('Error'); - expect(sanitizeDocumentId(sensitiveId, false)).toBe('[document]'); - expect(sanitizeFileError('read', sensitiveError, false)).toBe('Failed to read file'); - expect(sanitizeStorageError('connect', sensitiveError, false)).toBe('Storage operation failed'); - }); - - it('should reveal information in verbose mode for debugging', () => { - const error = new Error('Detailed debug information'); - const id = 'doc-12345'; - - 
expect(sanitizeErrorMessage(error, 'Error', true)).toContain('Detailed debug information'); - expect(sanitizeDocumentId(id, true)).toBe('doc-12345'); - expect(sanitizeFileError('read', error, true)).toContain('Detailed debug information'); - expect(sanitizeStorageError('connect', error, true)).toContain('Detailed debug information'); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/errors.spec.ts b/libs/vectoriadb/src/__tests__/errors.spec.ts deleted file mode 100644 index 5821860..0000000 --- a/libs/vectoriadb/src/__tests__/errors.spec.ts +++ /dev/null @@ -1,413 +0,0 @@ -import { VectoriaDB } from '../vectoria'; -import { - VectoriaError, - VectoriaNotInitializedError, - DocumentValidationError, - DocumentNotFoundError, - DocumentExistsError, - DuplicateDocumentError, - QueryValidationError, - EmbeddingError, - ConfigurationError, -} from '../errors'; -import type { DocumentMetadata } from '../interfaces'; - -interface TestMetadata extends DocumentMetadata { - category: string; -} - -describe('VectoriaDB Error Handling', () => { - describe('Error Classes', () => { - test('VectoriaError should have correct properties', () => { - const error = new VectoriaError('Test message', 'TEST_CODE'); - expect(error).toBeInstanceOf(Error); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Test message'); - expect(error.code).toBe('TEST_CODE'); - expect(error.name).toBe('VectoriaError'); - }); - - test('VectoriaNotInitializedError should have correct properties', () => { - const error = new VectoriaNotInitializedError('testing'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('VectoriaDB must be initialized before testing. Call initialize() first.'); - expect(error.code).toBe('NOT_INITIALIZED'); - expect(error.name).toBe('VectoriaNotInitializedError'); - }); - - test('DocumentValidationError should have correct properties', () => { - const error = new DocumentValidationError('Invalid document', 'doc-1'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Invalid document'); - expect(error.code).toBe('DOCUMENT_VALIDATION_ERROR'); - expect(error.documentId).toBe('doc-1'); - expect(error.name).toBe('DocumentValidationError'); - }); - - test('DocumentNotFoundError should have correct properties', () => { - const error = new DocumentNotFoundError('doc-1'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Document with id "doc-1" not found'); - expect(error.code).toBe('DOCUMENT_NOT_FOUND'); - expect(error.documentId).toBe('doc-1'); - expect(error.name).toBe('DocumentNotFoundError'); - }); - - test('DocumentExistsError should have correct properties', () => { - const error = new DocumentExistsError('doc-1'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toContain('already exists'); - expect(error.code).toBe('DOCUMENT_EXISTS'); - expect(error.documentId).toBe('doc-1'); - expect(error.name).toBe('DocumentExistsError'); - }); - - test('DuplicateDocumentError should have correct properties for batch', () => { - const error = new DuplicateDocumentError('doc-1', 'batch'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Duplicate document id "doc-1" in batch'); - expect(error.code).toBe('DUPLICATE_DOCUMENT'); - expect(error.documentId).toBe('doc-1'); - expect(error.context).toBe('batch'); - }); - - test('DuplicateDocumentError should have correct properties for existing', () => { - const error = new DuplicateDocumentError('doc-1', 'existing'); 
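
Taken together, the expectations in this file specify a compact error hierarchy: a base class carrying a machine-readable code, and subclasses that add fields such as documentId or context plus fixed message templates. A sketch of what that hierarchy plausibly looked like, inferred from the assertions rather than copied from the deleted source:

export class VectoriaError extends Error {
  constructor(
    message: string,
    public readonly code: string,
  ) {
    super(message);
    // new.target is the class actually constructed, so subclasses get
    // their own name ('DuplicateDocumentError', ...) automatically.
    this.name = new.target.name;
  }
}

export class DuplicateDocumentError extends VectoriaError {
  constructor(
    public readonly documentId: string,
    public readonly context: 'batch' | 'existing',
  ) {
    super(
      context === 'batch'
        ? `Duplicate document id "${documentId}" in batch`
        : `Document with id "${documentId}" already exists`,
      'DUPLICATE_DOCUMENT',
    );
  }
}
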
- expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Document with id "doc-1" already exists'); - expect(error.code).toBe('DUPLICATE_DOCUMENT'); - expect(error.documentId).toBe('doc-1'); - expect(error.context).toBe('existing'); - }); - - test('QueryValidationError should have correct properties', () => { - const error = new QueryValidationError('Invalid query'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Invalid query'); - expect(error.code).toBe('QUERY_VALIDATION_ERROR'); - expect(error.name).toBe('QueryValidationError'); - }); - - test('EmbeddingError should have correct properties', () => { - const details = { count: 5 }; - const error = new EmbeddingError('Embedding failed', details); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Embedding failed'); - expect(error.code).toBe('EMBEDDING_ERROR'); - expect(error.details).toEqual(details); - expect(error.name).toBe('EmbeddingError'); - }); - - test('ConfigurationError should have correct properties', () => { - const error = new ConfigurationError('Invalid config'); - expect(error).toBeInstanceOf(VectoriaError); - expect(error.message).toBe('Invalid config'); - expect(error.code).toBe('CONFIGURATION_ERROR'); - expect(error.name).toBe('ConfigurationError'); - }); - }); - - describe('VectoriaDB Error Throwing', () => { - describe('Not Initialized Errors', () => { - test('should throw VectoriaNotInitializedError when adding before init', async () => { - const db = new VectoriaDB(); - await expect(db.add('doc-1', 'test', { id: 'doc-1', category: 'test' })).rejects.toThrow( - VectoriaNotInitializedError, - ); - - try { - await db.add('doc-1', 'test', { id: 'doc-1', category: 'test' }); - } catch (error) { - expect(error).toBeInstanceOf(VectoriaNotInitializedError); - expect((error as VectoriaNotInitializedError).code).toBe('NOT_INITIALIZED'); - } - }); - - test('should throw VectoriaNotInitializedError when searching before init', async () => { - const db = new VectoriaDB(); - await expect(db.search('test')).rejects.toThrow(VectoriaNotInitializedError); - }); - - test('should throw VectoriaNotInitializedError when updating before init', () => { - const db = new VectoriaDB(); - expect(() => db.updateMetadata('doc-1', { id: 'doc-1', category: 'test' })).toThrow( - VectoriaNotInitializedError, - ); - }); - - test('should throw VectoriaNotInitializedError when getting stats before init', () => { - const db = new VectoriaDB(); - expect(() => db.getStats()).toThrow(VectoriaNotInitializedError); - }); - - test('should throw VectoriaNotInitializedError when saving before init', async () => { - const db = new VectoriaDB(); - await expect(db.saveToStorage()).rejects.toThrow(VectoriaNotInitializedError); - }); - }); - - describe('Document Validation Errors', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - }); - - test('should throw DocumentValidationError for empty text in add()', async () => { - await expect(db.add('doc-1', '', { id: 'doc-1', category: 'test' })).rejects.toThrow(DocumentValidationError); - - try { - await db.add('doc-1', '', { id: 'doc-1', category: 'test' }); - } catch (error) { - expect(error).toBeInstanceOf(DocumentValidationError); - expect((error as DocumentValidationError).documentId).toBe('doc-1'); - expect((error as DocumentValidationError).code).toBe('DOCUMENT_VALIDATION_ERROR'); - } - }); - - test('should throw DocumentValidationError for whitespace-only text', async () => { - 
await expect(db.add('doc-1', ' ', { id: 'doc-1', category: 'test' })).rejects.toThrow( - DocumentValidationError, - ); - }); - - test('should throw DocumentValidationError for metadata.id mismatch in add()', async () => { - await expect(db.add('doc-1', 'test', { id: 'doc-2', category: 'test' })).rejects.toThrow( - DocumentValidationError, - ); - - try { - await db.add('doc-1', 'test', { id: 'doc-2', category: 'test' }); - } catch (error) { - expect(error).toBeInstanceOf(DocumentValidationError); - expect((error as DocumentValidationError).message).toContain('does not match'); - } - }); - - test('should throw DocumentValidationError for empty text in addMany()', async () => { - await expect( - db.addMany([ - { id: 'doc-1', text: 'valid', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-2', text: '', metadata: { id: 'doc-2', category: 'test' } }, - ]), - ).rejects.toThrow(DocumentValidationError); - }); - - test('should throw DocumentValidationError for metadata.id mismatch in addMany()', async () => { - await expect( - db.addMany([{ id: 'doc-1', text: 'test', metadata: { id: 'doc-2', category: 'test' } }]), - ).rejects.toThrow(DocumentValidationError); - }); - - test('should throw DocumentValidationError for empty text in update()', async () => { - await db.add('doc-1', 'original', { id: 'doc-1', category: 'test' }); - await expect(db.update('doc-1', { text: '' })).rejects.toThrow(DocumentValidationError); - }); - - test('should throw DocumentValidationError for empty text in updateMany()', async () => { - await db.add('doc-1', 'original', { id: 'doc-1', category: 'test' }); - await expect(db.updateMany([{ id: 'doc-1', text: ' ' }])).rejects.toThrow(DocumentValidationError); - }); - }); - - describe('Document Exists Errors', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - await db.add('existing', 'test', { id: 'existing', category: 'test' }); - }); - - test('should throw DocumentExistsError when adding duplicate', async () => { - await expect(db.add('existing', 'new text', { id: 'existing', category: 'test' })).rejects.toThrow( - DocumentExistsError, - ); - - try { - await db.add('existing', 'new text', { id: 'existing', category: 'test' }); - } catch (error) { - expect(error).toBeInstanceOf(DocumentExistsError); - expect((error as DocumentExistsError).documentId).toBe('existing'); - expect((error as DocumentExistsError).code).toBe('DOCUMENT_EXISTS'); - } - }); - }); - - describe('Duplicate Document Errors', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - }); - - test('should throw DuplicateDocumentError for duplicate in batch', async () => { - await expect( - db.addMany([ - { id: 'doc-1', text: 'first', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-1', text: 'second', metadata: { id: 'doc-1', category: 'test' } }, - ]), - ).rejects.toThrow(DuplicateDocumentError); - - try { - await db.addMany([ - { id: 'doc-1', text: 'first', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-1', text: 'second', metadata: { id: 'doc-1', category: 'test' } }, - ]); - } catch (error) { - expect(error).toBeInstanceOf(DuplicateDocumentError); - expect((error as DuplicateDocumentError).context).toBe('batch'); - expect((error as DuplicateDocumentError).documentId).toBe('doc-1'); - } - }); - - test('should throw DuplicateDocumentError for existing document in batch', async () => { - await db.add('existing', 'test', { id: 'existing', category: 'test' }); - 
- await expect( - db.addMany([{ id: 'existing', text: 'new', metadata: { id: 'existing', category: 'test' } }]), - ).rejects.toThrow(DuplicateDocumentError); - - try { - await db.addMany([{ id: 'existing', text: 'new', metadata: { id: 'existing', category: 'test' } }]); - } catch (error) { - expect(error).toBeInstanceOf(DuplicateDocumentError); - expect((error as DuplicateDocumentError).context).toBe('existing'); - } - }); - }); - - describe('Document Not Found Errors', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - }); - - test('should throw DocumentNotFoundError in updateMetadata()', () => { - expect(() => db.updateMetadata('nonexistent', { id: 'nonexistent', category: 'test' })).toThrow( - DocumentNotFoundError, - ); - - try { - db.updateMetadata('nonexistent', { id: 'nonexistent', category: 'test' }); - } catch (error) { - expect(error).toBeInstanceOf(DocumentNotFoundError); - expect((error as DocumentNotFoundError).documentId).toBe('nonexistent'); - expect((error as DocumentNotFoundError).code).toBe('DOCUMENT_NOT_FOUND'); - } - }); - - test('should throw DocumentNotFoundError in update()', async () => { - await expect(db.update('nonexistent', { text: 'new' })).rejects.toThrow(DocumentNotFoundError); - }); - - test('should throw DocumentNotFoundError in updateMany()', async () => { - await expect(db.updateMany([{ id: 'nonexistent', text: 'new' }])).rejects.toThrow(DocumentNotFoundError); - }); - }); - - describe('Query Validation Errors', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - }); - - test('should throw QueryValidationError for empty query', async () => { - await expect(db.search('')).rejects.toThrow(QueryValidationError); - - try { - await db.search(''); - } catch (error) { - expect(error).toBeInstanceOf(QueryValidationError); - expect((error as QueryValidationError).code).toBe('QUERY_VALIDATION_ERROR'); - expect((error as QueryValidationError).message).toContain('empty'); - } - }); - - test('should throw QueryValidationError for whitespace-only query', async () => { - await expect(db.search(' ')).rejects.toThrow(QueryValidationError); - }); - - test('should throw QueryValidationError for invalid topK', async () => { - await expect(db.search('test', { topK: 0 })).rejects.toThrow(QueryValidationError); - await expect(db.search('test', { topK: -5 })).rejects.toThrow(QueryValidationError); - - try { - await db.search('test', { topK: 0 }); - } catch (error) { - expect(error).toBeInstanceOf(QueryValidationError); - expect((error as QueryValidationError).message).toContain('topK'); - } - }); - - test('should throw QueryValidationError for invalid threshold', async () => { - await expect(db.search('test', { threshold: -0.1 })).rejects.toThrow(QueryValidationError); - await expect(db.search('test', { threshold: 1.5 })).rejects.toThrow(QueryValidationError); - - try { - await db.search('test', { threshold: 2.0 }); - } catch (error) { - expect(error).toBeInstanceOf(QueryValidationError); - expect((error as QueryValidationError).message).toContain('threshold'); - } - }); - }); - - describe('Error Catching by Type', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB(); - await db.initialize(); - }); - - test('developers can catch specific error types', async () => { - try { - await db.add('doc-1', '', { id: 'doc-1', category: 'test' }); - fail('Should have thrown error'); - } catch (error) { - if (error instanceof 
DocumentValidationError) { - expect(error.code).toBe('DOCUMENT_VALIDATION_ERROR'); - expect(error.documentId).toBe('doc-1'); - } else { - fail('Wrong error type'); - } - } - }); - - test('developers can catch base VectoriaError', async () => { - try { - await db.search(''); - fail('Should have thrown error'); - } catch (error) { - if (error instanceof VectoriaError) { - expect(error.code).toBe('QUERY_VALIDATION_ERROR'); - } else { - fail('Wrong error type'); - } - } - }); - - test('developers can check error codes', async () => { - await db.add('doc-1', 'test', { id: 'doc-1', category: 'test' }); - - try { - await db.add('doc-1', 'duplicate', { id: 'doc-1', category: 'test' }); - } catch (error) { - if (error instanceof VectoriaError) { - if (error.code === 'DOCUMENT_EXISTS') { - // Handle duplicate document - expect(true).toBe(true); - } - } - } - }); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/hnsw.spec.ts b/libs/vectoriadb/src/__tests__/hnsw.spec.ts deleted file mode 100644 index b8cee1e..0000000 --- a/libs/vectoriadb/src/__tests__/hnsw.spec.ts +++ /dev/null @@ -1,311 +0,0 @@ -import { HNSWIndex } from '../hnsw.index'; - -describe('HNSWIndex', () => { - let index: HNSWIndex; - - beforeEach(() => { - index = new HNSWIndex(); - }); - - describe('initialization', () => { - test('should create empty index', () => { - expect(index.size()).toBe(0); - }); - - test('should accept custom configuration', () => { - const customIndex = new HNSWIndex({ - M: 32, - M0: 64, - efConstruction: 400, - efSearch: 100, - }); - expect(customIndex.size()).toBe(0); - }); - }); - - describe('insert', () => { - test('should insert single vector', () => { - const vector = new Float32Array([1, 0, 0, 0]); - index.insert('doc-1', vector); - expect(index.size()).toBe(1); - }); - - test('should insert multiple vectors', () => { - const vectors = [new Float32Array([1, 0, 0, 0]), new Float32Array([0, 1, 0, 0]), new Float32Array([0, 0, 1, 0])]; - - vectors.forEach((vec, i) => { - index.insert(`doc-${i}`, vec); - }); - - expect(index.size()).toBe(3); - }); - - test('should handle high-dimensional vectors', () => { - const dim = 384; - const vector = new Float32Array(dim); - for (let i = 0; i < dim; i++) { - vector[i] = Math.random(); - } - - index.insert('doc-1', vector); - expect(index.size()).toBe(1); - }); - }); - - describe('search', () => { - beforeEach(() => { - // Insert some test vectors - const vectors = [ - { id: 'doc-1', vector: new Float32Array([1, 0, 0, 0]) }, - { id: 'doc-2', vector: new Float32Array([0.9, 0.1, 0, 0]) }, - { id: 'doc-3', vector: new Float32Array([0, 1, 0, 0]) }, - { id: 'doc-4', vector: new Float32Array([0, 0, 1, 0]) }, - { id: 'doc-5', vector: new Float32Array([0, 0, 0, 1]) }, - ]; - - vectors.forEach(({ id, vector }) => { - index.insert(id, vector); - }); - }); - - test('should find nearest neighbor', () => { - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 1); - - expect(results.length).toBe(1); - expect(results[0].id).toBe('doc-1'); - expect(results[0].distance).toBeCloseTo(0, 5); - }); - - test('should find top-k neighbors', () => { - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 3); - - expect(results.length).toBe(3); - expect(results[0].id).toBe('doc-1'); - expect(results[1].id).toBe('doc-2'); - }); - - test('should return empty results for empty index', () => { - const emptyIndex = new HNSWIndex(); - const query = new Float32Array([1, 0, 0, 0]); - const results = emptyIndex.search(query, 5); - - 
expect(results.length).toBe(0); - }); - - test('should handle k larger than index size', () => { - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 100); - - expect(results.length).toBe(5); // Only 5 vectors in index - }); - - test('should use custom efSearch parameter', () => { - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 3, 100); - - expect(results.length).toBe(3); - }); - - test('should find approximate neighbors', () => { - // Create fresh index for this test with different dimensions - const testIndex = new HNSWIndex(); - - // Insert many vectors to test approximate search - const numVectors = 100; - for (let i = 0; i < numVectors; i++) { - const vector = new Float32Array(10); - for (let j = 0; j < 10; j++) { - vector[j] = Math.random(); - } - testIndex.insert(`doc-${i}`, vector); - } - - const query = new Float32Array(10); - for (let i = 0; i < 10; i++) { - query[i] = Math.random(); - } - - const results = testIndex.search(query, 10); - expect(results.length).toBe(10); - // Results should be sorted by distance (ascending) - for (let i = 1; i < results.length; i++) { - expect(results[i].distance).toBeGreaterThanOrEqual(results[i - 1].distance); - } - }); - }); - - describe('remove', () => { - beforeEach(() => { - // Insert test vectors - for (let i = 0; i < 5; i++) { - const vector = new Float32Array([i, 0, 0, 0]); - index.insert(`doc-${i}`, vector); - } - }); - - test('should remove existing vector', () => { - expect(index.size()).toBe(5); - const removed = index.remove('doc-2'); - expect(removed).toBe(true); - expect(index.size()).toBe(4); - }); - - test('should return false for non-existent vector', () => { - const removed = index.remove('doc-999'); - expect(removed).toBe(false); - expect(index.size()).toBe(5); - }); - - test('should update search results after removal', () => { - index.remove('doc-0'); - - const query = new Float32Array([0, 0, 0, 0]); - const results = index.search(query, 5); - - expect(results.length).toBe(4); - expect(results.find((r) => r.id === 'doc-0')).toBeUndefined(); - }); - - test('should handle removing entry point', () => { - // Remove the first inserted vector (likely entry point) - index.remove('doc-0'); - - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 3); - - expect(results.length).toBeGreaterThan(0); - }); - - test('should handle removing all vectors', () => { - for (let i = 0; i < 5; i++) { - index.remove(`doc-${i}`); - } - - expect(index.size()).toBe(0); - - const query = new Float32Array([1, 0, 0, 0]); - const results = index.search(query, 5); - expect(results.length).toBe(0); - }); - }); - - describe('clear', () => { - test('should clear all vectors', () => { - for (let i = 0; i < 10; i++) { - const vector = new Float32Array([i, 0, 0, 0]); - index.insert(`doc-${i}`, vector); - } - - expect(index.size()).toBe(10); - index.clear(); - expect(index.size()).toBe(0); - }); - - test('should allow re-insertion after clear', () => { - const vector = new Float32Array([1, 0, 0, 0]); - index.insert('doc-1', vector); - index.clear(); - - index.insert('doc-2', vector); - expect(index.size()).toBe(1); - - const results = index.search(vector, 1); - expect(results[0].id).toBe('doc-2'); - }); - }); - - describe('performance characteristics', () => { - test('should maintain graph connectivity', () => { - // Insert many vectors and ensure all are searchable - const numVectors = 50; - const vectors: Array<{ id: string; vector: Float32Array }> = []; - - 
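
These performance tests lean on HNSW's probabilistic layering: every inserted node draws a level from a geometric-like distribution, so a hundred inserts almost surely produce a multi-level graph. Assuming HNSWIndex follows the standard construction from the HNSW paper (the deleted source is not shown), the level draw looks like this:

// mL is typically 1 / ln(M); using 1 - Math.random() keeps the argument
// to Math.log strictly positive, so the result is always finite.
function randomLevel(mL: number): number {
  return Math.floor(-Math.log(1 - Math.random()) * mL);
}

With M = 16 (mL roughly 0.36), P(level >= 1) = 1/M, so about 94% of nodes stay on level 0 and the expected top level grows logarithmically with the node count, which is what makes the search efficient.
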
for (let i = 0; i < numVectors; i++) { - const vector = new Float32Array(10); - for (let j = 0; j < 10; j++) { - vector[j] = Math.random(); - } - vectors.push({ id: `doc-${i}`, vector }); - index.insert(`doc-${i}`, vector); - } - - // Verify each vector can be found - for (const { id, vector } of vectors) { - const results = index.search(vector, 1); - expect(results.length).toBeGreaterThan(0); - // The exact vector should be the closest (distance ~0) - expect(results[0].id).toBe(id); - expect(results[0].distance).toBeCloseTo(0, 5); - } - }); - - test('should handle identical vectors', () => { - const vector = new Float32Array([1, 0, 0, 0]); - - index.insert('doc-1', vector); - index.insert('doc-2', vector); - index.insert('doc-3', vector); - - const results = index.search(vector, 3); - expect(results.length).toBe(3); - // All should have distance ~0 - results.forEach((result) => { - expect(result.distance).toBeCloseTo(0, 5); - }); - }); - - test('should assign different levels to nodes', () => { - // Insert many nodes and check that some get higher levels - // This is probabilistic but with 100 nodes, very likely to have multi-level structure - for (let i = 0; i < 100; i++) { - const vector = new Float32Array(10); - for (let j = 0; j < 10; j++) { - vector[j] = Math.random(); - } - index.insert(`doc-${i}`, vector); - } - - // If HNSW is working, should be able to search efficiently - const query = new Float32Array(10); - for (let i = 0; i < 10; i++) { - query[i] = Math.random(); - } - - const results = index.search(query, 10); - expect(results.length).toBe(10); - }); - }); - - describe('edge cases', () => { - test('should handle zero vectors', () => { - const zero = new Float32Array([0, 0, 0, 0]); - index.insert('zero', zero); - - const results = index.search(zero, 1); - expect(results[0].id).toBe('zero'); - }); - - test('should handle normalized vectors', () => { - // Unit vectors - const v1 = new Float32Array([1 / Math.sqrt(2), 1 / Math.sqrt(2), 0, 0]); - const v2 = new Float32Array([1 / Math.sqrt(2), -1 / Math.sqrt(2), 0, 0]); - - index.insert('v1', v1); - index.insert('v2', v2); - - const results = index.search(v1, 2); - expect(results[0].id).toBe('v1'); - }); - - test('should handle single vector index', () => { - const vector = new Float32Array([1, 2, 3, 4]); - index.insert('only', vector); - - const results = index.search(vector, 1); - expect(results.length).toBe(1); - expect(results[0].id).toBe('only'); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/regex.spec.ts b/libs/vectoriadb/src/__tests__/regex.spec.ts deleted file mode 100644 index ed159c9..0000000 --- a/libs/vectoriadb/src/__tests__/regex.spec.ts +++ /dev/null @@ -1,330 +0,0 @@ -/** - * Tests for regex utilities and ReDoS protection - */ - -import { isPotentiallyVulnerableRegex, createSafeRegex, safeTest, SAFE_PATTERNS } from '../regex.utils'; - -describe('Regex Utilities', () => { - describe('isPotentiallyVulnerableRegex', () => { - describe('vulnerable patterns', () => { - it('should detect nested quantifiers - (a+)+', () => { - expect(isPotentiallyVulnerableRegex('(a+)+$')).toBe(true); - }); - - it('should detect nested quantifiers - (a*)*', () => { - expect(isPotentiallyVulnerableRegex('(a*)*')).toBe(true); - }); - - it('should detect nested quantifiers with curly braces - (a{1,3})+', () => { - expect(isPotentiallyVulnerableRegex('(a{1,3})+')).toBe(true); - }); - - it('should detect alternation with overlapping patterns - (a|ab)*', () => { - expect(isPotentiallyVulnerableRegex('(a|ab)*$')).toBe(true); - }); 
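
The vulnerable cases in this block share two shapes: a quantifier applied to a group that itself contains a quantifier, and a quantifier applied to a group containing an alternation. A naive heuristic detector consistent with every case in this file (the deleted implementation may have used different or additional patterns):

export function isPotentiallyVulnerableRegex(source: string): boolean {
  // Group containing a quantifier, itself quantified: (a+)+, (a{1,3})+, ((a+)+)+
  const nestedQuantifier = /\([^()]*[+*}][^()]*\)[+*{]/;
  // Quantified group containing an alternation: (a|ab)*, (foo|foobar)*
  const quantifiedAlternation = /\([^()]*\|[^()]*\)[+*{]/;
  return nestedQuantifier.test(source) || quantifiedAlternation.test(source);
}

Note this sketch is deliberately conservative: it flags every quantified alternation, overlapping or not, and it ignores escaping, so a pattern like \(a\+\)+ would be a false positive.
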
- - it('should detect alternation with quantifiers - (x|xy)+', () => { - expect(isPotentiallyVulnerableRegex('(x|xy)+')).toBe(true); - }); - - it('should detect repeated groups with quantifiers - (a+)+', () => { - expect(isPotentiallyVulnerableRegex('(a+)+')).toBe(true); - }); - - it('should detect complex nested quantifiers - ((a+)+)+', () => { - expect(isPotentiallyVulnerableRegex('((a+)+)+')).toBe(true); - }); - - it('should detect alternation with star quantifier - (foo|foobar)*', () => { - expect(isPotentiallyVulnerableRegex('(foo|foobar)*')).toBe(true); - }); - }); - - describe('safe patterns', () => { - it('should not flag simple character class as vulnerable', () => { - expect(isPotentiallyVulnerableRegex('[a-zA-Z0-9]+')).toBe(false); - }); - - it('should not flag simple alternation without quantifiers', () => { - expect(isPotentiallyVulnerableRegex('(foo|bar)')).toBe(false); - }); - - it('should not flag single quantifier without nesting', () => { - expect(isPotentiallyVulnerableRegex('a+')).toBe(false); - }); - - it('should not flag word boundary patterns', () => { - expect(isPotentiallyVulnerableRegex('\\b\\w+\\b')).toBe(false); - }); - - it('should not flag digit patterns', () => { - expect(isPotentiallyVulnerableRegex('\\d{1,3}')).toBe(false); - }); - - it('should not flag simple dot pattern', () => { - expect(isPotentiallyVulnerableRegex('.*')).toBe(false); - }); - - it('should not flag negated character class', () => { - expect(isPotentiallyVulnerableRegex('[^a-z]+')).toBe(false); - }); - }); - }); - - describe('createSafeRegex', () => { - it('should create a function that matches valid input', () => { - const safeRegex = createSafeRegex('[a-z]+'); - const result = safeRegex('hello'); - - expect(result).not.toBeNull(); - expect(result![0]).toBe('hello'); - }); - - it('should create a function that handles non-matching input', () => { - const safeRegex = createSafeRegex('^[0-9]+$'); - const result = safeRegex('abc'); - - expect(result).toBeNull(); - }); - - it('should accept RegExp object as pattern', () => { - const pattern = /test/i; - const safeRegex = createSafeRegex(pattern); - const result = safeRegex('TEST'); - - expect(result).not.toBeNull(); - expect(result![0]).toBe('TEST'); - }); - - it('should accept flags parameter', () => { - const safeRegex = createSafeRegex('hello', 'i'); - const result = safeRegex('HELLO'); - - expect(result).not.toBeNull(); - expect(result![0]).toBe('HELLO'); - }); - - it('should truncate input longer than 10000 characters', () => { - const safeRegex = createSafeRegex('a+'); - const longInput = 'a'.repeat(15000); - const result = safeRegex(longInput); - - expect(result).not.toBeNull(); - // Input should be truncated to 10000 chars - expect(result![0]).toHaveLength(10000); - }); - - it('should handle never-matching patterns', () => { - const safeRegex = createSafeRegex('(?!)'); // Negative lookahead that never matches - const result = safeRegex('test'); - - expect(result).toBeNull(); - }); - - it('should work with global flag', () => { - const safeRegex = createSafeRegex('\\d+', 'g'); - const result = safeRegex('123 456'); - - expect(result).not.toBeNull(); - expect(result![0]).toBe('123'); - }); - - it('should handle empty input', () => { - const safeRegex = createSafeRegex('.*'); - const result = safeRegex(''); - - expect(result).not.toBeNull(); - expect(result![0]).toBe(''); - }); - - it('should handle complex patterns safely', () => { - const safeRegex = createSafeRegex('\\b[A-Z][a-z]+\\b'); - const result = safeRegex('Hello World'); - - 
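
The createSafeRegex assertions imply a thin wrapper: normalize the pattern, cap the input at 10,000 characters, and return the exec() result. A sketch that satisfies the behavior tested here, assuming this is approximately what the deleted utility did:

export function createSafeRegex(
  pattern: string | RegExp,
  flags?: string,
): (input: string) => RegExpExecArray | null {
  const re =
    typeof pattern === 'string'
      ? new RegExp(pattern, flags)
      : new RegExp(pattern.source, flags ?? pattern.flags);
  return (input: string) => {
    // The length cap bounds worst-case backtracking cost on hostile input.
    const capped = input.length > 10000 ? input.slice(0, 10000) : input;
    // Reset so a 'g' or 'y' regex always matches from the start.
    re.lastIndex = 0;
    return re.exec(capped);
  };
}
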
expect(result).not.toBeNull(); - expect(result![0]).toBe('Hello'); - }); - }); - - describe('safeTest', () => { - it('should return true for matching pattern', () => { - const pattern = /^[a-z]+$/; - expect(safeTest('hello', pattern)).toBe(true); - }); - - it('should return false for non-matching pattern', () => { - const pattern = /^[0-9]+$/; - expect(safeTest('abc', pattern)).toBe(false); - }); - - it('should return false for input exceeding maxLength', () => { - const pattern = /a+/; - const longInput = 'a'.repeat(15000); - - expect(safeTest(longInput, pattern, 10000)).toBe(false); - }); - - it('should use default maxLength of 10000', () => { - const pattern = /a+/; - const input = 'a'.repeat(9999); - - expect(safeTest(input, pattern)).toBe(true); - }); - - it('should accept custom maxLength', () => { - const pattern = /test/; - const input = 'test'; - - expect(safeTest(input, pattern, 100)).toBe(true); - }); - - it('should handle never-matching patterns', () => { - const pattern = /(?!)/; // Negative lookahead that never matches - expect(safeTest('test', pattern)).toBe(false); - }); - - it('should work with case-insensitive patterns', () => { - const pattern = /hello/i; - expect(safeTest('HELLO', pattern)).toBe(true); - }); - - it('should work with multiline patterns', () => { - const pattern = /^test$/m; - expect(safeTest('foo\ntest\nbar', pattern)).toBe(true); - }); - - it('should return false for empty string with strict pattern', () => { - const pattern = /^[a-z]+$/; - expect(safeTest('', pattern)).toBe(false); - }); - - it('should handle special regex characters', () => { - const pattern = /\d{3}-\d{4}/; - expect(safeTest('123-4567', pattern)).toBe(true); - }); - - it('should enforce length limit strictly', () => { - const pattern = /a+/; - const input = 'a'.repeat(101); - - expect(safeTest(input, pattern, 100)).toBe(false); - }); - }); - - describe('SAFE_PATTERNS', () => { - it('should have CONTROL_CHARS pattern', () => { - expect(SAFE_PATTERNS.CONTROL_CHARS).toBeInstanceOf(RegExp); - expect(SAFE_PATTERNS.CONTROL_CHARS.global).toBe(true); - }); - - it('CONTROL_CHARS should match newlines', () => { - expect('test\n'.replace(SAFE_PATTERNS.CONTROL_CHARS, '')).toBe('test'); - }); - - it('CONTROL_CHARS should match carriage returns', () => { - expect('test\r'.replace(SAFE_PATTERNS.CONTROL_CHARS, '')).toBe('test'); - }); - - it('CONTROL_CHARS should match tabs', () => { - expect('test\t'.replace(SAFE_PATTERNS.CONTROL_CHARS, '')).toBe('test'); - }); - - it('CONTROL_CHARS should match null bytes', () => { - expect('test\0'.replace(SAFE_PATTERNS.CONTROL_CHARS, '')).toBe('test'); - }); - - it('should have PATH_SEPARATORS pattern', () => { - expect(SAFE_PATTERNS.PATH_SEPARATORS).toBeInstanceOf(RegExp); - expect(SAFE_PATTERNS.PATH_SEPARATORS.global).toBe(true); - }); - - it('PATH_SEPARATORS should match forward slash', () => { - expect('test/path'.replace(SAFE_PATTERNS.PATH_SEPARATORS, '-')).toBe('test-path'); - }); - - it('PATH_SEPARATORS should match backslash', () => { - expect('test\\path'.replace(SAFE_PATTERNS.PATH_SEPARATORS, '-')).toBe('test-path'); - }); - - it('should have DIR_TRAVERSAL pattern', () => { - expect(SAFE_PATTERNS.DIR_TRAVERSAL).toBeInstanceOf(RegExp); - expect(SAFE_PATTERNS.DIR_TRAVERSAL.global).toBe(true); - }); - - it('DIR_TRAVERSAL should match double dots', () => { - expect('../test/..'.replace(SAFE_PATTERNS.DIR_TRAVERSAL, '')).toBe('/test/'); - }); - - it('should have ALPHANUMERIC_SAFE pattern', () => { - 
expect(SAFE_PATTERNS.ALPHANUMERIC_SAFE).toBeInstanceOf(RegExp); - expect(SAFE_PATTERNS.ALPHANUMERIC_SAFE.global).toBe(true); - }); - - it('ALPHANUMERIC_SAFE should keep word characters', () => { - expect('test123_ABC'.replace(SAFE_PATTERNS.ALPHANUMERIC_SAFE, '')).toBe('test123_ABC'); - }); - - it('ALPHANUMERIC_SAFE should keep hyphens', () => { - expect('test-name'.replace(SAFE_PATTERNS.ALPHANUMERIC_SAFE, '')).toBe('test-name'); - }); - - it('ALPHANUMERIC_SAFE should remove special characters', () => { - expect('test@#$%'.replace(SAFE_PATTERNS.ALPHANUMERIC_SAFE, '')).toBe('test'); - }); - - it('should have REDIS_KEY_SAFE pattern', () => { - expect(SAFE_PATTERNS.REDIS_KEY_SAFE).toBeInstanceOf(RegExp); - expect(SAFE_PATTERNS.REDIS_KEY_SAFE.global).toBe(true); - }); - - it('REDIS_KEY_SAFE should keep word characters, colons, dots, dashes', () => { - expect('test:key.name-1'.replace(SAFE_PATTERNS.REDIS_KEY_SAFE, '')).toBe('test:key.name-1'); - }); - - it('REDIS_KEY_SAFE should remove unsafe characters', () => { - expect('test@#$%'.replace(SAFE_PATTERNS.REDIS_KEY_SAFE, '')).toBe('test'); - }); - - it('should have LEADING_DOTS_DASHES pattern', () => { - expect(SAFE_PATTERNS.LEADING_DOTS_DASHES).toBeInstanceOf(RegExp); - }); - - it('LEADING_DOTS_DASHES should match leading dots', () => { - expect('...test'.replace(SAFE_PATTERNS.LEADING_DOTS_DASHES, '')).toBe('test'); - }); - - it('LEADING_DOTS_DASHES should match leading dashes', () => { - expect('---test'.replace(SAFE_PATTERNS.LEADING_DOTS_DASHES, '')).toBe('test'); - }); - - it('LEADING_DOTS_DASHES should match mixed leading dots and dashes', () => { - expect('.-.-test'.replace(SAFE_PATTERNS.LEADING_DOTS_DASHES, '')).toBe('test'); - }); - - it('should have TRAILING_DOTS_DASHES pattern', () => { - expect(SAFE_PATTERNS.TRAILING_DOTS_DASHES).toBeInstanceOf(RegExp); - }); - - it('TRAILING_DOTS_DASHES should match trailing dots', () => { - expect('test...'.replace(SAFE_PATTERNS.TRAILING_DOTS_DASHES, '')).toBe('test'); - }); - - it('TRAILING_DOTS_DASHES should match trailing dashes', () => { - expect('test---'.replace(SAFE_PATTERNS.TRAILING_DOTS_DASHES, '')).toBe('test'); - }); - - it('TRAILING_DOTS_DASHES should match mixed trailing dots and dashes', () => { - expect('test.-.-'.replace(SAFE_PATTERNS.TRAILING_DOTS_DASHES, '')).toBe('test'); - }); - - it('all SAFE_PATTERNS should not be vulnerable to ReDoS', () => { - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.CONTROL_CHARS.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.PATH_SEPARATORS.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.DIR_TRAVERSAL.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.ALPHANUMERIC_SAFE.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.REDIS_KEY_SAFE.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.LEADING_DOTS_DASHES.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.TRAILING_DOTS_DASHES.source)).toBe(false); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/security.spec.ts b/libs/vectoriadb/src/__tests__/security.spec.ts deleted file mode 100644 index 06f4477..0000000 --- a/libs/vectoriadb/src/__tests__/security.spec.ts +++ /dev/null @@ -1,439 +0,0 @@ -/** - * Security Tests for VectoriaDB - * Comprehensive security validation and protection mechanisms - */ - -import { VectoriaDB } from '../vectoria'; -import { FileStorageAdapter } from '../storage/file.adapter'; -import { RedisStorageAdapter } from 
'../storage/redis.adapter'; -import * as SerializationUtils from '../storage/serialization.utils'; -import { isPotentiallyVulnerableRegex, SAFE_PATTERNS } from '../regex.utils'; -import { DocumentValidationError } from '../errors'; -import { createHash } from 'crypto'; -import * as fs from 'fs/promises'; -import * as path from 'path'; - -describe('Security Tests', () => { - describe('Path Traversal Protection', () => { - it('should prevent directory traversal in FileStorageAdapter namespace', async () => { - const maliciousNamespace = '../../../etc/passwd'; - const adapter = new FileStorageAdapter({ - namespace: maliciousNamespace, - cacheDir: './.cache/security-test', - }); - - await adapter.initialize(); - - // Verify the file path doesn't escape the cache directory - const filePath = (adapter as any).filePath; - const resolvedPath = path.resolve(filePath); - const cacheDir = path.resolve('./.cache/security-test'); - - expect(resolvedPath.startsWith(cacheDir)).toBe(true); - expect(resolvedPath).not.toContain('..'); - expect(resolvedPath).not.toContain('etc/passwd'); - - await adapter.close(); - }); - - it('should sanitize namespace with path separators', async () => { - const maliciousNamespace = 'test/../../admin'; - const adapter = new FileStorageAdapter({ - namespace: maliciousNamespace, - cacheDir: './.cache/security-test', - }); - - await adapter.initialize(); - - const filePath = (adapter as any).filePath; - // Path separators in the actual path are OK, but the namespace part should be sanitized - expect(filePath).not.toContain('../'); - // Check that path separators from the malicious input were sanitized - expect(filePath).toContain('test---admin'); // '..' segments are stripped first (leaving 'test///admin'), then each '/' becomes '-' - - await adapter.close(); - }); - - it('should sanitize extremely malicious namespace', () => { - // Create adapter with extremely malicious namespace - it should sanitize, not throw - const adapter = new FileStorageAdapter({ - namespace: '../'.repeat(100), - cacheDir: './.cache/security-test', - }); - - const filePath = (adapter as any).filePath; - const resolvedPath = path.resolve(filePath); - const cacheDir = path.resolve('./.cache/security-test'); - - // Should still be within cache directory - expect(resolvedPath.startsWith(cacheDir)).toBe(true); - }); - }); - - describe('Prototype Pollution Protection', () => { - it('should prevent prototype pollution via __proto__ in JSON', () => { - const maliciousJson = JSON.stringify({ - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test', - dimensions: 384, - documentCount: 1, - }, - embeddings: [ - { - id: 'test-1', - vector: [0.1, 0.2, 0.3], - metadata: { - __proto__: { isAdmin: true }, - id: 'test-1', - category: 'test', - }, - text: 'test document', - createdAt: new Date().toISOString(), - }, - ], - }); - - const parsed = JSON.parse(maliciousJson); - const sanitized = SerializationUtils.sanitizeObject(parsed) as any; - - // Verify __proto__ was blocked (check it's not an own property) - expect(sanitized).toBeTruthy(); - expect(Object.prototype.hasOwnProperty.call(sanitized.embeddings[0].metadata, '__proto__')).toBe(false); - - // Verify prototype wasn't polluted - const testObj: any = {}; - expect(testObj.isAdmin).toBeUndefined(); - }); - - it('should prevent prototype pollution via constructor in JSON', () => { - const maliciousJson = JSON.stringify({ - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test', - dimensions: 384, - documentCount: 1, - },
- embeddings: [ - { - id: 'test-1', - vector: [0.1, 0.2, 0.3], - metadata: { - constructor: { prototype: { isAdmin: true } }, - id: 'test-1', - category: 'test', - }, - text: 'test document', - createdAt: new Date().toISOString(), - }, - ], - }); - - const parsed = JSON.parse(maliciousJson); - const sanitized = SerializationUtils.sanitizeObject(parsed) as any; - - // Verify constructor was blocked (check it's not an own property) - expect(sanitized).toBeTruthy(); - expect(Object.prototype.hasOwnProperty.call(sanitized.embeddings[0].metadata, 'constructor')).toBe(false); - }); - - it('should sanitize nested objects recursively', () => { - const maliciousJson = JSON.stringify({ - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test', - dimensions: 384, - documentCount: 1, - }, - embeddings: [ - { - id: 'test-1', - vector: [0.1, 0.2, 0.3], - metadata: { - id: 'test-1', - nested: { - deeper: { - __proto__: { polluted: true }, - safeField: 'value', - }, - }, - }, - text: 'test document', - createdAt: new Date().toISOString(), - }, - ], - }); - - const parsed = JSON.parse(maliciousJson); - const sanitized = SerializationUtils.sanitizeObject(parsed) as any; - - expect(sanitized).toBeTruthy(); - expect(Object.prototype.hasOwnProperty.call(sanitized.embeddings[0].metadata.nested.deeper, '__proto__')).toBe( - false, - ); - expect(sanitized.embeddings[0].metadata.nested.deeper.safeField).toBe('value'); - }); - }); - - describe('Resource Limit Enforcement', () => { - let db: VectoriaDB; - - beforeEach(async () => { - db = new VectoriaDB({ - maxDocuments: 10, - maxDocumentSize: 100, - maxBatchSize: 5, - }); - await db.initialize(); - }); - - afterEach(async () => { - await db.close(); - }); - - it('should reject adding documents beyond maxDocuments limit', async () => { - // Add up to the limit - for (let i = 0; i < 10; i++) { - await db.add(`doc-${i}`, `Document ${i}`, { id: `doc-${i}` }); - } - - // Try to add one more - await expect(db.add('doc-11', 'Exceeds limit', { id: 'doc-11' })).rejects.toThrow(DocumentValidationError); - await expect(db.add('doc-11', 'Exceeds limit', { id: 'doc-11' })).rejects.toThrow(/Document limit exceeded/); - }); - - it('should reject documents exceeding maxDocumentSize', async () => { - const largeText = 'a'.repeat(101); // 101 characters - - await expect(db.add('large-doc', largeText, { id: 'large-doc' })).rejects.toThrow(DocumentValidationError); - await expect(db.add('large-doc', largeText, { id: 'large-doc' })).rejects.toThrow(/exceeds maximum size/); - }); - - it('should reject batch operations exceeding maxBatchSize', async () => { - const largeBatch = Array.from({ length: 6 }, (_, i) => ({ - id: `batch-${i}`, - text: `Document ${i}`, - metadata: { id: `batch-${i}` }, - })); - - await expect(db.addMany(largeBatch)).rejects.toThrow(DocumentValidationError); - await expect(db.addMany(largeBatch)).rejects.toThrow(/Batch size exceeds maximum/); - }); - - it('should reject batch that would exceed total document limit', async () => { - // Add 8 documents - for (let i = 0; i < 8; i++) { - await db.add(`doc-${i}`, `Document ${i}`, { id: `doc-${i}` }); - } - - // Try to add batch of 5 (total would be 13, exceeds limit of 10) - const batch = Array.from({ length: 5 }, (_, i) => ({ - id: `batch-${i}`, - text: `Document ${i}`, - metadata: { id: `batch-${i}` }, - })); - - await expect(db.addMany(batch)).rejects.toThrow(DocumentValidationError); - await expect(db.addMany(batch)).rejects.toThrow(/would exceed maximum document limit/); 
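
The limit failures asserted in this block reduce to cheap guard clauses that run before any embedding work. A sketch of the kind of check add()/addMany() presumably performed, with message templates chosen to satisfy the regexes in these tests (treating DocumentValidationError's second argument as optional is an assumption):

function assertBatchWithinLimits(
  batchSize: number,
  currentCount: number,
  limits: { maxBatchSize: number; maxDocuments: number },
): void {
  if (batchSize > limits.maxBatchSize) {
    // Matches /Batch size exceeds maximum/
    throw new DocumentValidationError(`Batch size exceeds maximum of ${limits.maxBatchSize}`);
  }
  if (currentCount + batchSize > limits.maxDocuments) {
    // Matches /would exceed maximum document limit/
    throw new DocumentValidationError(
      `Adding ${batchSize} documents would exceed maximum document limit of ${limits.maxDocuments}`,
    );
  }
}
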
- }); - - it('should reject update with oversized text', async () => { - await db.add('doc-1', 'Original text', { id: 'doc-1' }); - - const largeText = 'b'.repeat(101); - await expect(db.update('doc-1', { text: largeText })).rejects.toThrow(DocumentValidationError); - await expect(db.update('doc-1', { text: largeText })).rejects.toThrow(/exceeds maximum size/); - }); - - it('should reject updateMany exceeding batch size', async () => { - // Add initial documents - for (let i = 0; i < 6; i++) { - await db.add(`doc-${i}`, `Document ${i}`, { id: `doc-${i}` }); - } - - const updates = Array.from({ length: 6 }, (_, i) => ({ - id: `doc-${i}`, - text: `Updated ${i}`, - })); - - await expect(db.updateMany(updates)).rejects.toThrow(DocumentValidationError); - await expect(db.updateMany(updates)).rejects.toThrow(/Batch size exceeds maximum/); - }); - }); - - describe('Cryptographic Hashing', () => { - it('should use SHA-256 for hashing instead of weak hash', () => { - const input = 'test-input-string'; - const hash = SerializationUtils.hash(input); - - // SHA-256 produces a hex string, our implementation takes first 16 chars - expect(hash).toHaveLength(16); - expect(hash).toMatch(/^[0-9a-f]+$/); - - // Verify it matches SHA-256 - const expectedHash = createHash('sha256').update(input, 'utf8').digest('hex').substring(0, 16); - expect(hash).toBe(expectedHash); - }); - - it('should produce consistent hashes', () => { - const input = 'test-consistency'; - const hash1 = SerializationUtils.hash(input); - const hash2 = SerializationUtils.hash(input); - - expect(hash1).toBe(hash2); - }); - - it('should produce different hashes for different inputs', () => { - const hash1 = SerializationUtils.hash('input-1'); - const hash2 = SerializationUtils.hash('input-2'); - - expect(hash1).not.toBe(hash2); - }); - }); - - describe('Redis Key Sanitization', () => { - // Helper function to create a mock Redis client - const createMockRedisClient = () => ({ - get: async () => null, - set: async () => 'OK', - setex: async () => 'OK', - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }); - - it('should sanitize namespace with newline characters', () => { - const mockClient = createMockRedisClient(); - const maliciousNamespace = 'test\nFLUSHDB\n'; - const adapter = new RedisStorageAdapter({ - client: mockClient, - namespace: maliciousNamespace, - }); - - const redisKey = (adapter as any).redisKey; - - // Verify newlines were removed (FLUSHDB letters remain but can't be injected as command without newlines) - expect(redisKey).not.toContain('\n'); - expect(redisKey).not.toContain('\r'); - // The key should be a single line, preventing command injection - expect(redisKey.split('\n').length).toBe(1); - }); - - it('should sanitize namespace with carriage returns', () => { - const mockClient = createMockRedisClient(); - const maliciousNamespace = 'test\rDEL *\r'; - const adapter = new RedisStorageAdapter({ - client: mockClient, - namespace: maliciousNamespace, - }); - - const redisKey = (adapter as any).redisKey; - - // Verify carriage returns were removed - expect(redisKey).not.toContain('\r'); - // The key should be a single line, preventing command injection - expect(redisKey.split('\r').length).toBe(1); - }); - - it('should only allow safe characters in namespace', () => { - const mockClient = createMockRedisClient(); - const unsafeNamespace = 'test!@#$%^&*()+=[]{}|;\'",<>?/\\`~'; - const adapter = new RedisStorageAdapter({ - client: mockClient, - namespace: 
unsafeNamespace, - }); - - const redisKey = (adapter as any).redisKey; - - // Should only contain word characters, colons, dots, and dashes - expect(redisKey).toMatch(/^[\w:.-]+$/); - }); - }); - - describe('Regular Expression Safety', () => { - it('should detect nested quantifiers as vulnerable', () => { - const vulnerablePattern = '(a+)+$'; - expect(isPotentiallyVulnerableRegex(vulnerablePattern)).toBe(true); - }); - - it('should detect alternation with quantifiers as vulnerable', () => { - const vulnerablePattern = '(a|ab)*$'; - expect(isPotentiallyVulnerableRegex(vulnerablePattern)).toBe(true); - }); - - it('should not flag safe patterns as vulnerable', () => { - const safePattern = '[a-zA-Z0-9]+'; - expect(isPotentiallyVulnerableRegex(safePattern)).toBe(false); - }); - - it('should provide safe regex patterns', () => { - // Verify SAFE_PATTERNS are actually safe - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.CONTROL_CHARS.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.PATH_SEPARATORS.source)).toBe(false); - expect(isPotentiallyVulnerableRegex(SAFE_PATTERNS.ALPHANUMERIC_SAFE.source)).toBe(false); - }); - }); - - describe('Error Message Configuration', () => { - it('should support verboseErrors configuration', async () => { - const dbVerbose = new VectoriaDB({ verboseErrors: true }); - const dbProduction = new VectoriaDB({ verboseErrors: false }); - - await dbVerbose.initialize(); - await dbProduction.initialize(); - - // Both should work the same, configuration is just for user reference - expect(dbVerbose['config'].verboseErrors).toBe(true); - expect(dbProduction['config'].verboseErrors).toBe(false); - - await dbVerbose.close(); - await dbProduction.close(); - }); - }); - - describe('Security Integration', () => { - it('should handle malicious input across multiple vectors simultaneously', async () => { - // Create DB with all security limits - const db = new VectoriaDB({ - maxDocuments: 5, - maxDocumentSize: 50, - maxBatchSize: 3, - verboseErrors: false, - }); - - await db.initialize(); - - // Try various attacks that should all be blocked - const attacks = [ - // DoS via oversized document - db.add('attack-1', 'x'.repeat(1000), { id: 'attack-1' }).catch((e) => e), - - // DoS via batch size - db - .addMany( - Array.from({ length: 10 }, (_, i) => ({ - id: `attack-${i}`, - text: 'test', - metadata: { id: `attack-${i}` }, - })), - ) - .catch((e) => e), - ]; - - const results = await Promise.all(attacks); - - // All attacks should be rejected - expect(results.every((r) => r instanceof Error)).toBe(true); - - await db.close(); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/similarity.spec.ts b/libs/vectoriadb/src/__tests__/similarity.spec.ts deleted file mode 100644 index db5362e..0000000 --- a/libs/vectoriadb/src/__tests__/similarity.spec.ts +++ /dev/null @@ -1,136 +0,0 @@ -import { cosineSimilarity, normalizeVector, euclideanDistance, dotProduct } from '../similarity.utils'; - -describe('Similarity Utils', () => { - describe('cosineSimilarity', () => { - test('should return 1 for identical vectors', () => { - const a = new Float32Array([1, 2, 3, 4]); - const b = new Float32Array([1, 2, 3, 4]); - - const similarity = cosineSimilarity(a, b); - expect(similarity).toBeCloseTo(1.0, 5); - }); - - test('should return 0 for orthogonal vectors', () => { - const a = new Float32Array([1, 0, 0]); - const b = new Float32Array([0, 1, 0]); - - const similarity = cosineSimilarity(a, b); - expect(similarity).toBeCloseTo(0.0, 5); - }); - - test('should return 
-1 for opposite vectors', () => { - const a = new Float32Array([1, 0, 0]); - const b = new Float32Array([-1, 0, 0]); - - const similarity = cosineSimilarity(a, b); - expect(similarity).toBeCloseTo(-1.0, 5); - }); - - test('should handle similar but not identical vectors', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([1, 2, 3.1]); - - const similarity = cosineSimilarity(a, b); - expect(similarity).toBeGreaterThan(0.99); - expect(similarity).toBeLessThan(1.0); - }); - - test('should throw error for vectors of different dimensions', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([1, 2]); - - expect(() => cosineSimilarity(a, b)).toThrow("Vector dimensions don't match"); - }); - - test('should return 0 for zero vectors', () => { - const a = new Float32Array([0, 0, 0]); - const b = new Float32Array([1, 2, 3]); - - const similarity = cosineSimilarity(a, b); - expect(similarity).toBe(0); - }); - }); - - describe('normalizeVector', () => { - test('should normalize a vector to unit length', () => { - const vector = new Float32Array([3, 4]); - const normalized = normalizeVector(vector); - - // Length should be 1 - const length = Math.sqrt(normalized[0] * normalized[0] + normalized[1] * normalized[1]); - expect(length).toBeCloseTo(1.0, 5); - - // Direction should be preserved - expect(normalized[0]).toBeCloseTo(0.6, 5); - expect(normalized[1]).toBeCloseTo(0.8, 5); - }); - - test('should handle already normalized vector', () => { - const vector = new Float32Array([1, 0, 0]); - const normalized = normalizeVector(vector); - - expect(normalized[0]).toBeCloseTo(1.0, 5); - expect(normalized[1]).toBeCloseTo(0.0, 5); - expect(normalized[2]).toBeCloseTo(0.0, 5); - }); - - test('should handle zero vector', () => { - const vector = new Float32Array([0, 0, 0]); - const normalized = normalizeVector(vector); - - expect(normalized[0]).toBe(0); - expect(normalized[1]).toBe(0); - expect(normalized[2]).toBe(0); - }); - }); - - describe('euclideanDistance', () => { - test('should calculate distance between identical vectors as 0', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([1, 2, 3]); - - const distance = euclideanDistance(a, b); - expect(distance).toBeCloseTo(0, 5); - }); - - test('should calculate distance correctly', () => { - const a = new Float32Array([0, 0]); - const b = new Float32Array([3, 4]); - - const distance = euclideanDistance(a, b); - expect(distance).toBeCloseTo(5.0, 5); // 3-4-5 triangle - }); - - test('should throw error for vectors of different dimensions', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([1, 2]); - - expect(() => euclideanDistance(a, b)).toThrow("Vector dimensions don't match"); - }); - }); - - describe('dotProduct', () => { - test('should calculate dot product correctly', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([4, 5, 6]); - - const result = dotProduct(a, b); - expect(result).toBe(32); // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32 - }); - - test('should return 0 for orthogonal vectors', () => { - const a = new Float32Array([1, 0, 0]); - const b = new Float32Array([0, 1, 0]); - - const result = dotProduct(a, b); - expect(result).toBe(0); - }); - - test('should throw error for vectors of different dimensions', () => { - const a = new Float32Array([1, 2, 3]); - const b = new Float32Array([1, 2]); - - expect(() => dotProduct(a, b)).toThrow("Vector dimensions don't match"); - }); - }); -}); diff --git 
a/libs/vectoriadb/src/__tests__/storage.spec.ts b/libs/vectoriadb/src/__tests__/storage.spec.ts deleted file mode 100644 index 7acd54b..0000000 --- a/libs/vectoriadb/src/__tests__/storage.spec.ts +++ /dev/null @@ -1,1211 +0,0 @@ -import { VectoriaDB } from '../vectoria'; -import { FileStorageAdapter } from '../storage/file.adapter'; -import { MemoryStorageAdapter } from '../storage/memory.adapter'; -import { RedisStorageAdapter } from '../storage/redis.adapter'; -import * as SerializationUtils from '../storage/serialization.utils'; -import { ConfigurationError, StorageError } from '../errors'; -import type { DocumentMetadata } from '../interfaces'; -import type { RedisClient } from '../storage/redis.adapter'; -import type { StoredData } from '../storage/adapter.interface'; -import * as fs from 'fs/promises'; -import * as path from 'path'; - -interface TestMetadata extends DocumentMetadata { - category: string; -} - -describe('Storage Adapters', () => { - const testCacheDir = './.cache/vectoriadb-test'; - - // Mock Redis client factory - shared across all Redis tests - const createMockRedisClient = (): RedisClient => { - const storage = new Map(); - - return { - async get(key: string) { - return storage.get(key) ?? null; - }, - async set(key: string, value: string) { - storage.set(key, value); - return 'OK'; - }, - async setex(key: string, _seconds: number, value: string) { - storage.set(key, value); - return 'OK'; - }, - async del(key: string) { - storage.delete(key); - return 1; - }, - async ping() { - return 'PONG'; - }, - async quit() { - // Connection close - data persists in Redis - }, - }; - }; - - afterEach(async () => { - // Cleanup test cache directory - try { - await fs.rm(testCacheDir, { recursive: true, force: true }); - } catch { - // Ignore errors - } - }); - - describe('SerializationUtils', () => { - test('should hash strings consistently', () => { - const hash1 = SerializationUtils.hash('test-string'); - const hash2 = SerializationUtils.hash('test-string'); - expect(hash1).toBe(hash2); - }); - - test('should create different hashes for different strings', () => { - const hash1 = SerializationUtils.hash('test-1'); - const hash2 = SerializationUtils.hash('test-2'); - expect(hash1).not.toBe(hash2); - }); - - test('should create tools hash from documents', () => { - const docs = [ - { id: 'doc-1', text: 'Test 1' }, - { id: 'doc-2', text: 'Test 2' }, - ]; - const hash = SerializationUtils.createToolsHash(docs); - expect(hash).toBeDefined(); - expect(typeof hash).toBe('string'); - }); - - test('should create same hash for same documents regardless of order', () => { - const docs1 = [ - { id: 'doc-1', text: 'Test 1' }, - { id: 'doc-2', text: 'Test 2' }, - ]; - const docs2 = [ - { id: 'doc-2', text: 'Test 2' }, - { id: 'doc-1', text: 'Test 1' }, - ]; - const hash1 = SerializationUtils.createToolsHash(docs1); - const hash2 = SerializationUtils.createToolsHash(docs2); - expect(hash1).toBe(hash2); - }); - }); - - describe('MemoryStorageAdapter', () => { - test('should not have cache on startup', async () => { - const adapter = new MemoryStorageAdapter(); - await adapter.initialize(); - - const hasCache = await adapter.hasValidCache({ - version: '1.0.0', - toolsHash: 'test', - timestamp: Date.now(), - modelName: 'test', - dimensions: 384, - documentCount: 0, - }); - - expect(hasCache).toBe(false); - }); - - test('should load null on first load', async () => { - const adapter = new MemoryStorageAdapter(); - await adapter.initialize(); - - const data = await adapter.load(); - 
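
The order-independence asserted for createToolsHash above is the usual canonicalize-then-hash pattern: sort by id, serialize deterministically, then hash. A plausible sketch, consistent with the SHA-256 truncated-to-16-hex-chars behavior the security suite pins down (assumed, not the deleted implementation):

import { createHash } from 'crypto';

export function createToolsHash(docs: Array<{ id: string; text: string }>): string {
  const canonical = [...docs]
    .sort((a, b) => a.id.localeCompare(b.id)) // insertion order no longer matters
    .map((d) => `${d.id}:${d.text}`)
    .join('|');
  return createHash('sha256').update(canonical, 'utf8').digest('hex').substring(0, 16);
}
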
expect(data).toBeNull(); - }); - }); - - describe('FileStorageAdapter', () => { - test('should create cache directory', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'test-1', - }); - - await adapter.initialize(); - - const dir = path.join(testCacheDir, 'test-1'); - const exists = await fs - .access(dir) - .then(() => true) - .catch(() => false); - expect(exists).toBe(true); - }); - - test('should save and load data', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'test-2', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [ - { - id: 'doc-1', - vector: [0.1, 0.2, 0.3], - metadata: { id: 'doc-1', category: 'test' }, - text: 'Test document', - createdAt: new Date().toISOString(), - }, - ], - }; - - await adapter.save(testData); - - const loaded = await adapter.load(); - expect(loaded).toBeDefined(); - expect(loaded?.embeddings.length).toBe(1); - expect(loaded?.embeddings[0].id).toBe('doc-1'); - }); - - test('should invalidate cache when tools hash changes', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'test-3', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [], - }; - - await adapter.save(testData); - - // Try to load with different tools hash - const hasValid = await adapter.hasValidCache({ - version: '1.0.0', - toolsHash: 'different-hash', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }); - - expect(hasValid).toBe(false); - }); - - test('should clear cache', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'test-4', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 0, - }, - embeddings: [], - }; - - await adapter.save(testData); - await adapter.clear(); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - }); - - describe('RedisStorageAdapter', () => { - test('should initialize with Redis client', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-redis-1', - }); - - await adapter.initialize(); - // No error means success - }); - - test('should save and load data from Redis', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-redis-2', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [ - { - id: 'doc-1', - vector: [0.1, 0.2, 0.3], - metadata: { id: 'doc-1', category: 'test' }, - text: 'Test document', - createdAt: new Date().toISOString(), - }, - ], - }; - - await adapter.save(testData); - - const loaded = await adapter.load(); - expect(loaded).toBeDefined(); - expect(loaded?.embeddings.length).toBe(1); - expect(loaded?.embeddings[0].id).toBe('doc-1'); - }); - - test('should clear Redis 
cache', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-redis-3', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'abc123', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 0, - }, - embeddings: [], - }; - - await adapter.save(testData); - await adapter.clear(); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - }); - - describe('VectoriaDB with Storage', () => { - describe('with FileStorageAdapter', () => { - test('should save and restore embeddings across restarts', async () => { - const docs = [ - { id: 'doc-1', text: 'Machine learning basics', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Cooking pasta', metadata: { id: 'doc-2', category: 'food' } }, - ]; - - const toolsHash = SerializationUtils.createToolsHash(docs); - - // First instance - create and save - const db1 = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-test-1', - }), - toolsHash, - version: '1.0.0', - }); - - await db1.initialize(); - await db1.addMany(docs); - await db1.saveToStorage(); - - expect(db1.size()).toBe(2); - - // Second instance - load from storage - const db2 = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-test-1', - }), - toolsHash, - version: '1.0.0', - }); - - await db2.initialize(); - - expect(db2.size()).toBe(2); - expect(db2.has('doc-1')).toBe(true); - expect(db2.has('doc-2')).toBe(true); - - const doc1 = db2.get('doc-1'); - expect(doc1?.text).toBe('Machine learning basics'); - }, 60000); - - test('should invalidate cache when tools change', async () => { - const docs1 = [{ id: 'doc-1', text: 'Original text', metadata: { id: 'doc-1', category: 'test' } }]; - - const toolsHash1 = SerializationUtils.createToolsHash(docs1); - - // First instance - const db1 = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-test-2', - }), - toolsHash: toolsHash1, - version: '1.0.0', - }); - - await db1.initialize(); - await db1.addMany(docs1); - await db1.saveToStorage(); - - expect(db1.size()).toBe(1); - - // Second instance with different tools - const docs2 = [{ id: 'doc-1', text: 'Changed text', metadata: { id: 'doc-1', category: 'test' } }]; - - const toolsHash2 = SerializationUtils.createToolsHash(docs2); - - const db2 = new VectoriaDB({ - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-test-2', - }), - toolsHash: toolsHash2, - version: '1.0.0', - }); - - await db2.initialize(); - - // Cache should be invalidated, database should be empty - expect(db2.size()).toBe(0); - }, 60000); - - test('should work with HNSW index after restore', async () => { - const docs = [ - { id: 'doc-1', text: 'Machine learning', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Deep learning', metadata: { id: 'doc-2', category: 'tech' } }, - { id: 'doc-3', text: 'Cooking', metadata: { id: 'doc-3', category: 'food' } }, - ]; - - const toolsHash = SerializationUtils.createToolsHash(docs); - - // First instance with HNSW - const db1 = new VectoriaDB({ - useHNSW: true, - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-hnsw-test', - }), - toolsHash, - version: '1.0.0', - }); - - await db1.initialize(); - await db1.addMany(docs); - 
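// Aside: the mock client in these tests doubles as documentation for the
// RedisClient surface — get/set/setex/del/ping/quit. A real client such as
// ioredis exposes methods with compatible shapes, so production wiring is
// plausibly just the sketch below; treat the ioredis compatibility claim as an
// assumption, not a tested integration from this repo.
import Redis from 'ioredis';

const redis = new Redis('redis://localhost:6379');
const prodAdapter = new RedisStorageAdapter({
  client: redis, // satisfies RedisClient: get/set/setex/del/ping/quit
  namespace: 'my-app-embeddings',
  ttl: 86400, // optional: expire cached embeddings after a day (see the TTL test below)
});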
await db1.saveToStorage(); - - // Second instance - restore with HNSW - const db2 = new VectoriaDB({ - useHNSW: true, - storageAdapter: new FileStorageAdapter({ - cacheDir: testCacheDir, - namespace: 'vectoria-hnsw-test', - }), - toolsHash, - version: '1.0.0', - }); - - await db2.initialize(); - - expect(db2.size()).toBe(3); - - // HNSW search should work - verify we can search after restore - const results = await db2.search('learning', { threshold: 0 }); - expect(results.length).toBeGreaterThan(0); // Should find at least the "learning" documents - - // Verify the tech documents are found - const techDocs = results.filter((r) => r.metadata.category === 'tech'); - expect(techDocs.length).toBeGreaterThan(0); - }, 60000); - }); - }); - - describe('Storage Adapter Error Handling', () => { - describe('MemoryStorageAdapter error scenarios', () => { - it('should handle clear() method', async () => { - const adapter = new MemoryStorageAdapter(); - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'test', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [], - }; - - await adapter.save(testData); - await adapter.clear(); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - - it('should handle close() method', async () => { - const adapter = new MemoryStorageAdapter(); - await adapter.initialize(); - await adapter.close(); - - // Should still work after close (in-memory has no cleanup) - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - }); - - describe('FileStorageAdapter error scenarios', () => { - const errorTestDir = './tmp/error-test-cache'; - - afterEach(async () => { - try { - await fs.rm(errorTestDir, { recursive: true, force: true }); - } catch { - // Ignore cleanup errors - } - }); - - it('should throw StorageError on initialize with invalid cache directory', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: '/invalid/path/that/does/not/exist', - namespace: 'test', - }); - - // Should throw StorageError when trying to create invalid directory - await expect(adapter.initialize()).rejects.toThrow(StorageError); - await expect(adapter.initialize()).rejects.toThrow('Failed to create cache directory'); - }); - - it('should handle load from non-existent file', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: errorTestDir, - namespace: 'non-existent', - }); - - await adapter.initialize(); - const loaded = await adapter.load(); - - expect(loaded).toBeNull(); - }); - - it('should handle corrupted JSON file', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: errorTestDir, - namespace: 'corrupted', - }); - - await adapter.initialize(); - - // Create corrupted JSON file - const filePath = path.join(errorTestDir, 'corrupted', 'embeddings.json'); - await fs.mkdir(path.dirname(filePath), { recursive: true }); - await fs.writeFile(filePath, 'invalid json content{{{'); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - - it('should handle clear() when file does not exist', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: errorTestDir, - namespace: 'missing', - }); - - await adapter.initialize(); - - // Should not throw when clearing non-existent file - await expect(adapter.clear()).resolves.not.toThrow(); - }); - - it('should handle close() method', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: errorTestDir, - namespace: 
'close-test', - }); - - await adapter.initialize(); - await expect(adapter.close()).resolves.not.toThrow(); - }); - }); - - describe('RedisStorageAdapter error scenarios', () => { - it('should handle connection failure during initialization', async () => { - const failingClient: RedisClient = { - get: async () => null, - set: async () => 'OK', - setex: async () => 'OK', - del: async () => 1, - ping: async () => { - throw new Error('Connection refused'); - }, - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: failingClient, - namespace: 'test', - }); - - await expect(adapter.initialize()).rejects.toThrow(/Failed to connect to Redis/); - }); - - it('should handle save errors', async () => { - const failingClient: RedisClient = { - get: async () => null, - set: async () => 'OK', - setex: async () => { - throw new Error('Save failed'); - }, - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: failingClient, - namespace: 'test', - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'test', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [], - }; - - await expect(adapter.save(testData)).rejects.toThrow(/Failed to save embeddings to Redis/); - }); - - it('should handle load errors', async () => { - const failingClient: RedisClient = { - get: async () => { - throw new Error('Get failed'); - }, - set: async () => 'OK', - setex: async () => 'OK', - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: failingClient, - namespace: 'test', - }); - - await adapter.initialize(); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - - it('should handle invalid JSON from Redis', async () => { - const invalidJsonClient: RedisClient = { - get: async () => 'invalid json{{{', - set: async () => 'OK', - setex: async () => 'OK', - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: invalidJsonClient, - namespace: 'test', - }); - - await adapter.initialize(); - - const loaded = await adapter.load(); - expect(loaded).toBeNull(); - }); - - it('should handle clear errors gracefully', async () => { - const failingClient: RedisClient = { - get: async () => null, - set: async () => 'OK', - setex: async () => 'OK', - del: async () => { - throw new Error('Delete failed'); - }, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: failingClient, - namespace: 'test', - }); - - await adapter.initialize(); - - // Should not throw even if delete fails - await expect(adapter.clear()).resolves.not.toThrow(); - }); - - it('should handle close errors gracefully', async () => { - const failingClient: RedisClient = { - get: async () => null, - set: async () => 'OK', - setex: async () => 'OK', - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - throw new Error('Quit failed'); - }, - }; - - const adapter = new RedisStorageAdapter({ - client: failingClient, - namespace: 'test', - }); - - await adapter.initialize(); - - // Should not throw even if quit fails - await 
expect(adapter.close()).resolves.not.toThrow(); - }); - - it('should work with custom TTL', async () => { - let savedTTL: number | undefined; - const customTTLClient: RedisClient = { - get: async () => null, - set: async () => 'OK', - setex: async (key, ttl, value) => { - savedTTL = ttl; - return 'OK'; - }, - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: customTTLClient, - namespace: 'test', - ttl: 3600, // 1 hour - }); - - await adapter.initialize(); - - const testData = { - metadata: { - version: '1.0.0', - toolsHash: 'test', - timestamp: Date.now(), - modelName: 'test-model', - dimensions: 384, - documentCount: 1, - }, - embeddings: [], - }; - - await adapter.save(testData); - expect(savedTTL).toBe(3600); - }); - - it('should work with custom key prefix', async () => { - let savedKey: string | undefined; - const customPrefixClient: RedisClient = { - get: async (key) => { - savedKey = key; - return null; - }, - set: async () => 'OK', - setex: async (key) => { - savedKey = key; - return 'OK'; - }, - del: async () => 1, - ping: async () => 'PONG', - quit: async () => { - /* no-op for test mock */ - }, - }; - - const adapter = new RedisStorageAdapter({ - client: customPrefixClient, - namespace: 'myapp', - keyPrefix: 'custom-prefix', - }); - - await adapter.initialize(); - await adapter.load(); - - expect(savedKey).toContain('custom-prefix'); - expect(savedKey).toContain('myapp'); - }); - }); - }); - - describe('BaseStorageAdapter Coverage', () => { - describe('isMetadataValid', () => { - it('should return false when version mismatch', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-version-mismatch', - }); - - await adapter.initialize(); - - // Save data with version 1 - const testData: StoredData = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test-model', - toolsHash: 'hash123', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }, - }; - await adapter.save(testData); - - // Check cache with different version - const hasCache = await adapter.hasValidCache({ - version: '2.0.0', - modelName: 'test-model', - toolsHash: 'hash123', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }); - - expect(hasCache).toBe(false); - }); - - it('should return false when toolsHash mismatch', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-tools-mismatch', - }); - - await adapter.initialize(); - - // Save data with toolsHash - const testData: StoredData = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test-model', - toolsHash: 'hash123', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }, - }; - await adapter.save(testData); - - // Check cache with different toolsHash - const hasCache = await adapter.hasValidCache({ - version: '1.0.0', - modelName: 'test-model', - toolsHash: 'hash456', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }); - - expect(hasCache).toBe(false); - }); - - it('should return false when modelName mismatch', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-model-mismatch', - }); - - await adapter.initialize(); - - // Save data with modelName - const testData: StoredData = { - embeddings: [], - metadata: { - version: '1.0.0', - 
modelName: 'test-model', - toolsHash: 'hash123', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }, - }; - await adapter.save(testData); - - // Check cache with different modelName - const hasCache = await adapter.hasValidCache({ - version: '1.0.0', - modelName: 'different-model', - toolsHash: 'hash123', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - }); - - expect(hasCache).toBe(false); - }); - - it('should handle JSON with prototype pollution keys', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: './tmp/json-parse-test', - namespace: 'test-json', - }); - - try { - await adapter.initialize(); - - // Manually write JSON with prototype pollution attempt. The computed key - // ['__proto__'] creates an own property; a bare __proto__ in an object - // literal would set the prototype and be dropped by JSON.stringify. - const filePath = path.join('./tmp/json-parse-test', 'test-json', 'embeddings.json'); - await fs.mkdir(path.dirname(filePath), { recursive: true }); - await fs.writeFile( - filePath, - JSON.stringify({ - ['__proto__']: { polluted: true }, - constructor: { bad: 'value' }, - prototype: { evil: 'data' }, - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test', - toolsHash: 'hash', - createdAt: new Date().toISOString(), - }, - }), - ); - - // Load should sanitize and not include prototype pollution - const data = await adapter.load(); - - expect(data).not.toBeNull(); - expect((data as any).__proto__).toBeUndefined(); - expect((data as any).constructor).toBeUndefined(); - expect((data as any).prototype).toBeUndefined(); - } finally { - // Clean up in finally rather than a bare catch: a catch that swallows - // errors would also swallow failed assertions and pass the test vacuously - await fs.rm('./tmp/json-parse-test', { recursive: true, force: true }).catch(() => undefined); - } - }); - - it('should handle invalid JSON gracefully', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: './tmp/invalid-json-test', - namespace: 'test-invalid', - }); - - try { - await adapter.initialize(); - - // Write invalid JSON - const filePath = path.join('./tmp/invalid-json-test', 'test-invalid', 'embeddings.json'); - await fs.mkdir(path.dirname(filePath), { recursive: true }); - await fs.writeFile(filePath, '{invalid json content}'); - - // Load should return null for invalid JSON - const data = await adapter.load(); - - expect(data).toBeNull(); - } finally { - // Same pattern: cleanup must not mask assertion failures - await fs.rm('./tmp/invalid-json-test', { recursive: true, force: true }).catch(() => undefined); - } - }); - }); - }); - - describe('FileStorageAdapter Edge Cases', () => { - it('should sanitize path traversal attempts', () => { - // Path traversal is sanitized, not rejected - const adapter = new FileStorageAdapter({ - cacheDir: '/safe/dir', - namespace: '../../../etc/passwd', // Path traversal attempt - }); - - // The adapter should be created successfully with sanitized namespace - expect(adapter).toBeDefined(); - - // Verify the filePath doesn't contain path traversal - const filePath = (adapter as any).filePath; - expect(filePath).toContain('/safe/dir'); - expect(filePath).not.toContain('..'); - expect(filePath).not.toContain('etc/passwd'); - }); - }); - - describe('RedisStorageAdapter Edge Cases', () => { - it('should throw ConfigurationError for empty namespace', () => { - const client = createMockRedisClient(); - - expect(() => { - new RedisStorageAdapter({ -
client, - namespace: '', // Empty namespace - }); - }).toThrow('Namespace must be a non-empty string'); - }); - - it('should use default namespace when null or undefined', () => { - const client = createMockRedisClient(); - - // null/undefined namespace gets default value from base class - const adapter1 = new RedisStorageAdapter({ - client, - namespace: null as any, - }); - expect(adapter1).toBeDefined(); - - const adapter2 = new RedisStorageAdapter({ - client, - namespace: undefined as any, - }); - expect(adapter2).toBeDefined(); - }); - - it('should throw ConfigurationError for namespace that becomes empty after sanitization', () => { - const client = createMockRedisClient(); - - expect(() => { - new RedisStorageAdapter({ - client, - namespace: '!!!@@@###', // Will be empty after sanitization - }); - }).toThrow(ConfigurationError); - expect(() => { - new RedisStorageAdapter({ - client, - namespace: '!!!@@@###', - }); - }).toThrow('Namespace becomes empty after sanitization'); - }); - }); - - describe('Error Handling - Serialization Failures', () => { - describe('FileStorageAdapter serialization error handling', () => { - it('should throw StorageError when safeJsonStringify returns empty', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: './tmp/serialization-error-test', - namespace: 'test-serialization', - }); - await adapter.initialize(); - - // Create data with BigInt which cannot be serialized by JSON.stringify - const invalidData: any = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test', - toolsHash: 'hash', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - invalidValue: BigInt(9007199254740991), // BigInt cannot be JSON.stringify'd - }, - }; - - await expect(adapter.save(invalidData)).rejects.toThrow(StorageError); - await expect(adapter.save(invalidData)).rejects.toThrow('Failed to serialize embeddings data'); - - // Cleanup - try { - await fs.rm('./tmp/serialization-error-test', { recursive: true, force: true }); - } catch { - // Ignore cleanup errors - } - }); - - it('should not double-wrap StorageError', async () => { - const adapter = new FileStorageAdapter({ - cacheDir: './tmp/double-wrap-test', - namespace: 'test-double-wrap', - }); - await adapter.initialize(); - - const invalidData: any = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test', - toolsHash: 'hash', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - invalidValue: BigInt(123), - }, - }; - - try { - await adapter.save(invalidData); - fail('Should have thrown StorageError'); - } catch (error) { - expect(error).toBeInstanceOf(StorageError); - // Verify the error message is the original one, not wrapped - expect((error as StorageError).message).toBe('Failed to serialize embeddings data'); - // Verify there's no nested "Failed to save embeddings to file" message - expect((error as StorageError).message).not.toContain('Failed to save embeddings to file'); - } - - // Cleanup - try { - await fs.rm('./tmp/double-wrap-test', { recursive: true, force: true }); - } catch { - // Ignore cleanup errors - } - }); - - it('should validate that sanitization prevents path traversal', () => { - // The sanitizeNamespace() method removes dangerous path traversal sequences - // This test verifies that path traversal attempts are neutralized - const adapter = new FileStorageAdapter({ - cacheDir: '/tmp/safe-dir', - namespace: '../../../etc/passwd', - }); - - // The adapter should be created successfully (sanitization works) - 
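// Aside: a sketch of the kind of namespace sanitization these tests describe —
// strip everything that is not filename-safe, then remove any remaining
// dot-dot runs. The actual sanitizeNamespace() body is not in this diff, so
// the exact whitelist below is an assumption; only the observable behavior
// (no '..' and no path separators in the final path) is pinned down by the
// assertions.
function sanitizeNamespaceSketch(namespace: string): string {
  return namespace
    .replace(/[^a-zA-Z0-9._-]/g, '') // drop path separators and other unsafe chars
    .replace(/\.\.+/g, ''); // remove dot-dot sequences left behind
}
// sanitizeNamespaceSketch('../../../etc/passwd') === 'etcpasswd'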
expect(adapter).toBeDefined(); - - // The actual file path should not contain any path traversal sequences - const filePath = (adapter as any).filePath; - expect(filePath).toContain('/tmp/safe-dir'); - expect(filePath).not.toContain('..'); - expect(filePath).not.toContain('etc/passwd'); - - // Note: The validateFilePath() method is a defensive check that's hard to trigger - // because sanitizeNamespace() removes all dangerous characters first - }); - }); - - describe('RedisStorageAdapter serialization error handling', () => { - it('should throw StorageError when safeJsonStringify returns empty', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-redis-serialization', - }); - await adapter.initialize(); - - // Create data with BigInt which cannot be serialized - const invalidData: any = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test', - toolsHash: 'hash', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - invalidValue: BigInt(9007199254740991), - }, - }; - - await expect(adapter.save(invalidData)).rejects.toThrow(StorageError); - await expect(adapter.save(invalidData)).rejects.toThrow('Failed to serialize embeddings data'); - }); - - it('should not double-wrap StorageError', async () => { - const client = createMockRedisClient(); - const adapter = new RedisStorageAdapter({ - client, - namespace: 'test-redis-double-wrap', - }); - await adapter.initialize(); - - const invalidData: any = { - embeddings: [], - metadata: { - version: '1.0.0', - modelName: 'test', - toolsHash: 'hash', - timestamp: Date.now(), - dimensions: 384, - documentCount: 0, - invalidValue: BigInt(456), - }, - }; - - try { - await adapter.save(invalidData); - fail('Should have thrown StorageError'); - } catch (error) { - expect(error).toBeInstanceOf(StorageError); - // Verify the error message is the original one, not wrapped - expect((error as StorageError).message).toBe('Failed to serialize embeddings data'); - // Verify there's no nested "Failed to save embeddings to Redis" message - expect((error as StorageError).message).not.toContain('Failed to save embeddings to Redis'); - } - }); - }); - }); -}); diff --git a/libs/vectoriadb/src/__tests__/vectoria.spec.ts b/libs/vectoriadb/src/__tests__/vectoria.spec.ts deleted file mode 100644 index 2e35d1d..0000000 --- a/libs/vectoriadb/src/__tests__/vectoria.spec.ts +++ /dev/null @@ -1,1150 +0,0 @@ -import { VectoriaDB } from '../vectoria'; -import { DocumentMetadata } from '../interfaces'; -import { - VectoriaNotInitializedError, - DocumentValidationError, - DocumentNotFoundError, - DocumentExistsError, - DuplicateDocumentError, - QueryValidationError, -} from '../errors'; - -interface TestMetadata extends DocumentMetadata { - category: string; - author?: string; - tags?: string[]; -} - -describe('VectoriaDB', () => { - let db: VectoriaDB; - - beforeAll(async () => { - db = new VectoriaDB(); - await db.initialize(); - }, 60000); - - afterEach(() => { - db.clear(); - }); - - describe('initialization', () => { - test('should initialize successfully', () => { - expect(db.isInitialized()).toBe(true); - }); - - test('should start with zero documents', () => { - expect(db.size()).toBe(0); - }); - }); - - describe('add', () => { - test('should add a document', async () => { - await db.add('doc-1', 'Test document', { - id: 'doc-1', - category: 'test', - }); - - expect(db.size()).toBe(1); - expect(db.has('doc-1')).toBe(true); - }); - - test('should store document with 
metadata', async () => { - const metadata: TestMetadata = { - id: 'doc-1', - category: 'test', - author: 'Alice', - tags: ['tag1', 'tag2'], - }; - - await db.add('doc-1', 'Test document', metadata); - - const doc = db.get('doc-1'); - expect(doc).toBeDefined(); - expect(doc?.metadata).toEqual(metadata); - expect(doc?.text).toBe('Test document'); - }); - - test('should throw error when adding duplicate document id', async () => { - await db.add('doc-1', 'First version', { - id: 'doc-1', - category: 'test', - }); - - await expect( - db.add('doc-1', 'Second version', { - id: 'doc-1', - category: 'updated', - }), - ).rejects.toThrow(DocumentExistsError); - - // Original document should remain unchanged - expect(db.size()).toBe(1); - const doc = db.get('doc-1'); - expect(doc?.text).toBe('First version'); - expect(doc?.metadata.category).toBe('test'); - }); - }); - - describe('addMany', () => { - test('should add multiple documents', async () => { - const docs = [ - { - id: 'doc-1', - text: 'Document 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Document 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - { - id: 'doc-3', - text: 'Document 3', - metadata: { id: 'doc-3', category: 'other' }, - }, - ]; - - await db.addMany(docs); - - expect(db.size()).toBe(3); - expect(db.has('doc-1')).toBe(true); - expect(db.has('doc-2')).toBe(true); - expect(db.has('doc-3')).toBe(true); - }); - - test('should handle empty array', async () => { - await db.addMany([]); - expect(db.size()).toBe(0); - }); - }); - - describe('search', () => { - beforeEach(async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Create a new user account in the system', - metadata: { - id: 'doc-1', - category: 'auth', - tags: ['user', 'create'], - }, - }, - { - id: 'doc-2', - text: 'Delete an existing user account', - metadata: { - id: 'doc-2', - category: 'auth', - tags: ['user', 'delete'], - }, - }, - { - id: 'doc-3', - text: 'Send email notifications to users', - metadata: { - id: 'doc-3', - category: 'notification', - tags: ['email', 'notify'], - }, - }, - { - id: 'doc-4', - text: 'Upload files to cloud storage', - metadata: { - id: 'doc-4', - category: 'storage', - tags: ['file', 'upload'], - }, - }, - ]); - }); - - test('should find relevant documents', async () => { - const results = await db.search('creating new accounts'); - - expect(results.length).toBeGreaterThan(0); - expect(results[0].id).toBe('doc-1'); // Most relevant - expect(results[0].score).toBeGreaterThan(0.3); - }); - - test('should return results sorted by score', async () => { - const results = await db.search('user management', { topK: 3 }); - - for (let i = 1; i < results.length; i++) { - expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score); - } - }); - - test('should respect topK parameter', async () => { - const results = await db.search('user', { topK: 2 }); - - expect(results.length).toBeLessThanOrEqual(2); - }); - - test('should respect threshold parameter', async () => { - const results = await db.search('user', { threshold: 0.8 }); - - results.forEach((result) => { - expect(result.score).toBeGreaterThanOrEqual(0.8); - }); - }); - - test('should filter by metadata', async () => { - const results = await db.search('user', { - filter: (metadata) => metadata.category === 'auth', - }); - - results.forEach((result) => { - expect(result.metadata.category).toBe('auth'); - }); - }); - - test('should handle complex filters', async () => { - const results = await db.search('user', { - filter: 
(metadata) => metadata.category === 'auth' && metadata.tags?.includes('create') === true, - }); - - expect(results.length).toBeGreaterThan(0); - results.forEach((result) => { - expect(result.metadata.category).toBe('auth'); - expect(result.metadata.tags).toContain('create'); - }); - }); - - test('should return empty array for no matches', async () => { - const results = await db.search('completely unrelated xyz abc', { - threshold: 0.9, - }); - - expect(results).toEqual([]); - }); - - test('should include vector when requested', async () => { - const results = await db.search('user', { - includeVector: true, - topK: 1, - }); - - expect(results[0].vector).toBeDefined(); - expect(results[0].vector).toBeInstanceOf(Float32Array); - }); - - test('should not include vector by default', async () => { - const results = await db.search('user', { topK: 1 }); - - expect(results[0].vector).toBeUndefined(); - }); - }); - - describe('get', () => { - test('should retrieve document by id', async () => { - await db.add('doc-1', 'Test', { - id: 'doc-1', - category: 'test', - }); - - const doc = db.get('doc-1'); - expect(doc).toBeDefined(); - expect(doc?.id).toBe('doc-1'); - expect(doc?.text).toBe('Test'); - }); - - test('should return undefined for non-existent id', () => { - const doc = db.get('non-existent'); - expect(doc).toBeUndefined(); - }); - }); - - describe('has', () => { - test('should return true for existing document', async () => { - await db.add('doc-1', 'Test', { - id: 'doc-1', - category: 'test', - }); - - expect(db.has('doc-1')).toBe(true); - }); - - test('should return false for non-existent document', () => { - expect(db.has('non-existent')).toBe(false); - }); - }); - - describe('remove', () => { - test('should remove a document', async () => { - await db.add('doc-1', 'Test', { - id: 'doc-1', - category: 'test', - }); - - expect(db.has('doc-1')).toBe(true); - - const removed = db.remove('doc-1'); - expect(removed).toBe(true); - expect(db.has('doc-1')).toBe(false); - expect(db.size()).toBe(0); - }); - - test('should return false for non-existent document', () => { - const removed = db.remove('non-existent'); - expect(removed).toBe(false); - }); - }); - - describe('removeMany', () => { - test('should remove multiple documents', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - { - id: 'doc-3', - text: 'Test 3', - metadata: { id: 'doc-3', category: 'test' }, - }, - ]); - - const removed = db.removeMany(['doc-1', 'doc-3']); - expect(removed).toBe(2); - expect(db.size()).toBe(1); - expect(db.has('doc-2')).toBe(true); - }); - - test('should handle non-existent ids', () => { - const removed = db.removeMany(['non-existent-1', 'non-existent-2']); - expect(removed).toBe(0); - }); - }); - - describe('clear', () => { - test('should remove all documents', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - ]); - - expect(db.size()).toBe(2); - - db.clear(); - - expect(db.size()).toBe(0); - expect(db.has('doc-1')).toBe(false); - expect(db.has('doc-2')).toBe(false); - }); - }); - - describe('filter', () => { - beforeEach(async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { - id: 'doc-1', - category: 'auth', - author: 'Alice', - }, - }, - { - id: 
'doc-2', - text: 'Test 2', - metadata: { - id: 'doc-2', - category: 'auth', - author: 'Bob', - }, - }, - { - id: 'doc-3', - text: 'Test 3', - metadata: { - id: 'doc-3', - category: 'notification', - author: 'Alice', - }, - }, - ]); - }); - - test('should filter by category', () => { - const results = db.filter((metadata) => metadata.category === 'auth'); - - expect(results.length).toBe(2); - results.forEach((doc) => { - expect(doc.metadata.category).toBe('auth'); - }); - }); - - test('should filter by author', () => { - const results = db.filter((metadata) => metadata.author === 'Alice'); - - expect(results.length).toBe(2); - results.forEach((doc) => { - expect(doc.metadata.author).toBe('Alice'); - }); - }); - - test('should handle complex filters', () => { - const results = db.filter((metadata) => metadata.category === 'auth' && metadata.author === 'Alice'); - - expect(results.length).toBe(1); - expect(results[0].id).toBe('doc-1'); - }); - }); - - describe('getStats', () => { - test('should return correct statistics', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - ]); - - const stats = db.getStats(); - - expect(stats.totalEmbeddings).toBe(2); - expect(stats.dimensions).toBe(384); - expect(stats.estimatedMemoryBytes).toBeGreaterThan(0); - expect(stats.modelName).toContain('MiniLM'); - }); - }); - - describe('keys and values', () => { - test('should return all keys', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - ]); - - const keys = db.keys(); - expect(keys).toContain('doc-1'); - expect(keys).toContain('doc-2'); - expect(keys.length).toBe(2); - }); - - test('should return all values', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - ]); - - const values = db.values(); - expect(values.length).toBe(2); - expect(values[0].text).toBeDefined(); - expect(values[0].metadata).toBeDefined(); - }); - }); - - describe('getAll', () => { - test('should return all documents', async () => { - await db.addMany([ - { - id: 'doc-1', - text: 'Test 1', - metadata: { id: 'doc-1', category: 'test' }, - }, - { - id: 'doc-2', - text: 'Test 2', - metadata: { id: 'doc-2', category: 'test' }, - }, - ]); - - const all = db.getAll(); - expect(all.length).toBe(2); - }); - }); - - describe('initialization requirements', () => { - let uninitializedDb: VectoriaDB; - - beforeEach(() => { - uninitializedDb = new VectoriaDB(); - }); - - test('should throw error when adding document before initialization', async () => { - await expect(uninitializedDb.add('doc-1', 'Test', { id: 'doc-1', category: 'test' })).rejects.toThrow( - VectoriaNotInitializedError, - ); - }); - - test('should throw error when adding many documents before initialization', async () => { - await expect( - uninitializedDb.addMany([{ id: 'doc-1', text: 'Test', metadata: { id: 'doc-1', category: 'test' } }]), - ).rejects.toThrow(VectoriaNotInitializedError); - }); - - test('should throw error when searching before initialization', async () => { - await expect(uninitializedDb.search('test query')).rejects.toThrow(VectoriaNotInitializedError); - }); - - test('should 
throw error when getting stats before initialization', () => { - expect(() => uninitializedDb.getStats()).toThrow(VectoriaNotInitializedError); - }); - - test('isInitialized should return false before initialization', () => { - expect(uninitializedDb.isInitialized()).toBe(false); - }); - - test('isInitialized should return true after initialization', async () => { - await uninitializedDb.initialize(); - expect(uninitializedDb.isInitialized()).toBe(true); - }); - }); - - describe('duplicate ID handling', () => { - test('should throw error when adding duplicate in addMany batch', async () => { - await expect( - db.addMany([ - { id: 'doc-1', text: 'First', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-1', text: 'Duplicate', metadata: { id: 'doc-1', category: 'test' } }, - ]), - ).rejects.toThrow(DuplicateDocumentError); - - // No documents should be added if batch fails - expect(db.size()).toBe(0); - }); - - test('should throw error when addMany contains ID that already exists', async () => { - await db.add('doc-1', 'Existing', { id: 'doc-1', category: 'test' }); - - await expect( - db.addMany([ - { id: 'doc-2', text: 'New', metadata: { id: 'doc-2', category: 'test' } }, - { id: 'doc-1', text: 'Duplicate', metadata: { id: 'doc-1', category: 'test' } }, - ]), - ).rejects.toThrow(DuplicateDocumentError); - - // Original document should remain, new ones should not be added - expect(db.size()).toBe(1); - expect(db.get('doc-2')).toBeUndefined(); - }); - - test('should successfully add after removing duplicate', async () => { - await db.add('doc-1', 'First version', { id: 'doc-1', category: 'test' }); - - db.remove('doc-1'); - - await db.add('doc-1', 'Second version', { id: 'doc-1', category: 'updated' }); - - const doc = db.get('doc-1'); - expect(doc?.text).toBe('Second version'); - expect(doc?.metadata.category).toBe('updated'); - }); - }); - - describe('configuration edge cases', () => { - test('should handle 0 as valid dimension value', () => { - const customDb = new VectoriaDB({ dimensions: 0 }); - // Dimensions should be 0, not fall back to default 384 - expect((customDb as any).config.dimensions).toBe(0); - }); - - test('should handle custom cache directory', () => { - const customDb = new VectoriaDB({ cacheDir: '/custom/path' }); - expect((customDb as any).config.cacheDir).toBe('/custom/path'); - }); - - test('should handle 0 as valid topK value', () => { - const customDb = new VectoriaDB({ defaultTopK: 0 }); - expect((customDb as any).config.defaultTopK).toBe(0); - }); - - test('should handle 0 as valid similarity threshold', () => { - const customDb = new VectoriaDB({ defaultSimilarityThreshold: 0 }); - expect((customDb as any).config.defaultSimilarityThreshold).toBe(0); - }); - - test('should use defaults when config values are null', () => { - const customDb = new VectoriaDB({ - modelName: null as any, - cacheDir: null as any, - dimensions: null as any, - }); - expect((customDb as any).config.modelName).toBe('Xenova/all-MiniLM-L6-v2'); - expect((customDb as any).config.cacheDir).toBe('./.cache/transformers'); - expect((customDb as any).config.dimensions).toBe(384); - }); - - test('should use defaults when config values are undefined', () => { - const customDb = new VectoriaDB({ - modelName: undefined, - cacheDir: undefined, - dimensions: undefined, - }); - expect((customDb as any).config.modelName).toBe('Xenova/all-MiniLM-L6-v2'); - expect((customDb as any).config.cacheDir).toBe('./.cache/transformers'); - expect((customDb as any).config.dimensions).toBe(384); - }); - }); 
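// Aside: the configuration edge cases above pin down a precise merging rule —
// 0 must survive, while null and undefined fall back to defaults. Nullish
// coalescing (??) gives exactly this behavior, where || would wrongly discard
// 0. A sketch of the presumed pattern; the defaults come from the assertions
// above, but the real constructor is not shown in this diff.
interface VectoriaConfigSketch {
  modelName?: string | null;
  cacheDir?: string | null;
  dimensions?: number | null;
}

function resolveConfigSketch(config: VectoriaConfigSketch = {}) {
  return {
    modelName: config.modelName ?? 'Xenova/all-MiniLM-L6-v2',
    cacheDir: config.cacheDir ?? './.cache/transformers',
    dimensions: config.dimensions ?? 384,
  };
}
// resolveConfigSketch({ dimensions: 0 }).dimensions === 0 (kept, not 384)
// resolveConfigSketch({ dimensions: null }).dimensions === 384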
- - describe('input validation', () => { - describe('add validation', () => { - test('should throw error for empty text', async () => { - await expect(db.add('doc-1', '', { id: 'doc-1', category: 'test' })).rejects.toThrow(DocumentValidationError); - }); - - test('should throw error for whitespace-only text', async () => { - await expect(db.add('doc-1', ' ', { id: 'doc-1', category: 'test' })).rejects.toThrow( - DocumentValidationError, - ); - }); - }); - - describe('addMany validation', () => { - test('should throw error for document with empty text', async () => { - await expect( - db.addMany([ - { id: 'doc-1', text: 'Valid text', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-2', text: '', metadata: { id: 'doc-2', category: 'test' } }, - ]), - ).rejects.toThrow(DocumentValidationError); - - // No documents should be added if validation fails - expect(db.size()).toBe(0); - }); - - test('should throw error for document with whitespace-only text', async () => { - await expect( - db.addMany([{ id: 'doc-1', text: ' \n\t ', metadata: { id: 'doc-1', category: 'test' } }]), - ).rejects.toThrow(DocumentValidationError); - }); - }); - - describe('search validation', () => { - test('should throw error for empty query', async () => { - await expect(db.search('')).rejects.toThrow(QueryValidationError); - }); - - test('should throw error for whitespace-only query', async () => { - await expect(db.search(' \n\t ')).rejects.toThrow(QueryValidationError); - }); - - test('should throw error for negative topK', async () => { - await expect(db.search('test', { topK: -1 })).rejects.toThrow(QueryValidationError); - }); - - test('should throw error for zero topK', async () => { - await expect(db.search('test', { topK: 0 })).rejects.toThrow(QueryValidationError); - }); - - test('should throw error for threshold below 0', async () => { - await expect(db.search('test', { threshold: -0.5 })).rejects.toThrow(QueryValidationError); - }); - - test('should throw error for threshold above 1', async () => { - await expect(db.search('test', { threshold: 1.5 })).rejects.toThrow(QueryValidationError); - }); - - test('should accept threshold of 0', async () => { - await db.add('doc-1', 'test document', { id: 'doc-1', category: 'test' }); - const results = await db.search('test', { threshold: 0 }); - expect(results).toBeDefined(); - }); - - test('should accept threshold of 1', async () => { - await db.add('doc-1', 'test document', { id: 'doc-1', category: 'test' }); - const results = await db.search('test', { threshold: 1 }); - expect(results).toBeDefined(); - }); - }); - }); - - describe('HNSW integration', () => { - let hnswDb: VectoriaDB; - - beforeAll(async () => { - hnswDb = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 16, - efConstruction: 200, - efSearch: 50, - }, - }); - await hnswDb.initialize(); - }, 60000); - - afterEach(() => { - hnswDb.clear(); - }); - - describe('configuration', () => { - test('should create database with HNSW enabled', () => { - expect(hnswDb.isInitialized()).toBe(true); - }); - - test('should accept custom HNSW parameters', () => { - const customDb = new VectoriaDB({ - useHNSW: true, - hnsw: { - M: 32, - M0: 64, - efConstruction: 400, - efSearch: 100, - }, - }); - expect(customDb).toBeDefined(); - }); - }); - - describe('add with HNSW', () => { - test('should add document to HNSW index', async () => { - await hnswDb.add('doc-1', 'Machine learning is fascinating', { - id: 'doc-1', - category: 'tech', - }); - - expect(hnswDb.size()).toBe(1); - 
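// Aside: the hnsw options exercised in these tests carry the standard HNSW
// meanings — M caps the number of graph links per node (M0 for layer 0),
// efConstruction sizes the candidate list while building the index, and
// efSearch sizes it at query time. Larger values trade memory and CPU for
// recall. The comments below are general HNSW guidance, not tuning advice
// from this library's docs.
const tunedDb = new VectoriaDB({
  useHNSW: true,
  hnsw: {
    M: 16, // graph degree; commonly 12-48
    efConstruction: 200, // build-time quality: higher = better graph, slower adds
    efSearch: 50, // query-time quality: raise this first if recall looks low
  },
});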
expect(hnswDb.has('doc-1')).toBe(true); - }); - - test('should add multiple documents to HNSW index', async () => { - const docs = [ - { id: 'doc-1', text: 'Machine learning basics', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Deep learning tutorial', metadata: { id: 'doc-2', category: 'tech' } }, - { id: 'doc-3', text: 'Cooking recipes', metadata: { id: 'doc-3', category: 'food' } }, - ]; - - await hnswDb.addMany(docs); - expect(hnswDb.size()).toBe(3); - }); - }); - - describe('search with HNSW', () => { - beforeEach(async () => { - await hnswDb.addMany([ - { id: 'doc-1', text: 'Machine learning and AI', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Deep learning neural networks', metadata: { id: 'doc-2', category: 'tech' } }, - { id: 'doc-3', text: 'Cooking Italian pasta', metadata: { id: 'doc-3', category: 'food' } }, - { id: 'doc-4', text: 'Python programming language', metadata: { id: 'doc-4', category: 'tech' } }, - { id: 'doc-5', text: 'Baking chocolate cake', metadata: { id: 'doc-5', category: 'food' } }, - ]); - }); - - test('should find relevant documents using HNSW', async () => { - const results = await hnswDb.search('artificial intelligence'); - - expect(results.length).toBeGreaterThan(0); - expect(results[0].score).toBeGreaterThan(0); - // Tech documents should be more relevant - expect(['doc-1', 'doc-2', 'doc-4']).toContain(results[0].id); - }); - - test('should return top-k results with HNSW', async () => { - const results = await hnswDb.search('technology', { topK: 2 }); - - expect(results.length).toBeLessThanOrEqual(2); - }); - - test('should filter results with HNSW', async () => { - const results = await hnswDb.search('food', { - filter: (metadata) => metadata.category === 'food', - }); - - expect(results.length).toBeGreaterThan(0); - results.forEach((result) => { - expect(result.metadata.category).toBe('food'); - }); - }); - - test('should respect similarity threshold with HNSW', async () => { - const results = await hnswDb.search('cooking', { threshold: 0.7 }); - - results.forEach((result) => { - expect(result.score).toBeGreaterThanOrEqual(0.7); - }); - }); - - test('should handle searches on large datasets', async () => { - // Add more documents to test HNSW performance characteristics - const largeBatch: Array<{ id: string; text: string; metadata: TestMetadata }> = []; - for (let i = 0; i < 50; i++) { - largeBatch.push({ - id: `doc-${i + 10}`, - text: `Document about topic ${i % 10}`, - metadata: { id: `doc-${i + 10}`, category: `category-${i % 5}` }, - }); - } - await hnswDb.addMany(largeBatch); - - const results = await hnswDb.search('topic'); - expect(results.length).toBeGreaterThan(0); - }); - }); - - describe('remove with HNSW', () => { - beforeEach(async () => { - await hnswDb.addMany([ - { id: 'doc-1', text: 'First document', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-2', text: 'Second document', metadata: { id: 'doc-2', category: 'test' } }, - { id: 'doc-3', text: 'Third document', metadata: { id: 'doc-3', category: 'test' } }, - ]); - }); - - test('should remove document from HNSW index', () => { - const removed = hnswDb.remove('doc-2'); - expect(removed).toBe(true); - expect(hnswDb.size()).toBe(2); - expect(hnswDb.has('doc-2')).toBe(false); - }); - - test('should update search results after removal', async () => { - hnswDb.remove('doc-1'); - - const results = await hnswDb.search('document'); - const ids = results.map((r) => r.id); - expect(ids).not.toContain('doc-1'); - }); - - 
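// Aside: note the sync/async split these removal tests rely on. Operations
// that need an embedding (add, addMany, update with new text, search) are
// async, while remove, removeMany, clear and plain reads are synchronous — no
// model call is involved. A sketch of the resulting call shapes:
async function removalShapes(store: VectoriaDB) {
  await store.add('tmp-1', 'temporary document', { id: 'tmp-1', category: 'scratch' });
  const removed: boolean = store.remove('tmp-1'); // sync: drops the stored vector immediately
  const count: number = store.removeMany(['a', 'b']); // sync: returns how many actually existed
  return { removed, count };
}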
test('should handle removing multiple documents', () => { - const removed = hnswDb.removeMany(['doc-1', 'doc-3']); - expect(removed).toBe(2); - expect(hnswDb.size()).toBe(1); - }); - }); - - describe('clear with HNSW', () => { - test('should clear all documents from HNSW index', async () => { - await hnswDb.addMany([ - { id: 'doc-1', text: 'First', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-2', text: 'Second', metadata: { id: 'doc-2', category: 'test' } }, - ]); - - hnswDb.clear(); - expect(hnswDb.size()).toBe(0); - - const results = await hnswDb.search('test'); - expect(results.length).toBe(0); - }); - }); - - describe('HNSW vs brute-force comparison', () => { - test('should produce similar results to brute-force search', async () => { - // Use same seed data for both - const docs = [ - { id: 'doc-1', text: 'Machine learning algorithms', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Neural network architecture', metadata: { id: 'doc-2', category: 'tech' } }, - { id: 'doc-3', text: 'Cooking Italian cuisine', metadata: { id: 'doc-3', category: 'food' } }, - { id: 'doc-4', text: 'Python data science', metadata: { id: 'doc-4', category: 'tech' } }, - ]; - - // HNSW results - await hnswDb.addMany(docs); - const hnswResults = await hnswDb.search('machine learning', { topK: 2 }); - - // Brute-force results - const bruteDb = new VectoriaDB(); - await bruteDb.initialize(); - await bruteDb.addMany(docs); - const bruteResults = await bruteDb.search('machine learning', { topK: 2 }); - - // Top result should be the same (or very similar) - expect(hnswResults[0].id).toBe(bruteResults[0].id); - expect(hnswResults.length).toBe(bruteResults.length); - }); - }); - - describe('edge cases with HNSW', () => { - test('should handle single document', async () => { - await hnswDb.add('doc-1', 'Only document', { id: 'doc-1', category: 'test' }); - - const results = await hnswDb.search('document'); - expect(results.length).toBe(1); - expect(results[0].id).toBe('doc-1'); - }); - - test('should handle duplicate IDs with HNSW', async () => { - await hnswDb.add('doc-1', 'First version', { id: 'doc-1', category: 'test' }); - - await expect(hnswDb.add('doc-1', 'Second version', { id: 'doc-1', category: 'test' })).rejects.toThrow( - DocumentExistsError, - ); - }); - - test('should require initialization before operations', async () => { - const uninitDb = new VectoriaDB({ useHNSW: true }); - - await expect(uninitDb.add('doc-1', 'Test', { id: 'doc-1', category: 'test' })).rejects.toThrow( - VectoriaNotInitializedError, - ); - }); - }); - }); - - describe('Incremental Updates', () => { - beforeEach(async () => { - await db.addMany([ - { id: 'doc-1', text: 'Machine learning basics', metadata: { id: 'doc-1', category: 'tech', author: 'Alice' } }, - { id: 'doc-2', text: 'Cooking pasta recipes', metadata: { id: 'doc-2', category: 'food', author: 'Bob' } }, - { id: 'doc-3', text: 'Python programming', metadata: { id: 'doc-3', category: 'tech', author: 'Charlie' } }, - ]); - }); - - describe('updateMetadata', () => { - test('should update metadata only without re-embedding', () => { - const originalDoc = db.get('doc-1')!; - const originalVector = originalDoc.vector; - const originalText = originalDoc.text; - const originalCreatedAt = originalDoc.createdAt; - - db.updateMetadata('doc-1', { id: 'doc-1', category: 'ai', author: 'Alice Updated' }); - - const updated = db.get('doc-1')!; - expect(updated.metadata.category).toBe('ai'); - expect(updated.metadata.author).toBe('Alice Updated'); - 
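// Note on the assertion style in the updateMetadata tests below: toBe()
// compares by reference, so checking the stored Float32Array with
// toBe(originalVector) proves updateMetadata reused the existing embedding
// rather than regenerating a value-equal one — a stronger and much cheaper
// check than comparing the vector element by element.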
expect(updated.text).toBe(originalText); - expect(updated.vector).toBe(originalVector); // Same reference = not re-embedded - expect(updated.createdAt).toBe(originalCreatedAt); - }); - - test('should throw error for non-existent document', () => { - expect(() => { - db.updateMetadata('non-existent', { id: 'non-existent', category: 'test' }); - }).toThrow(DocumentNotFoundError); - }); - - test('should throw error if not initialized', () => { - const uninitDb = new VectoriaDB(); - expect(() => { - uninitDb.updateMetadata('doc-1', { id: 'doc-1', category: 'test' }); - }).toThrow('VectoriaDB must be initialized'); - }); - }); - - describe('update', () => { - test('should update metadata only when text not changed', async () => { - const originalVector = db.get('doc-1')!.vector; - - const reembedded = await db.update('doc-1', { - metadata: { id: 'doc-1', category: 'ai', author: 'Alice' }, - }); - - expect(reembedded).toBe(false); // Not re-embedded - const updated = db.get('doc-1')!; - expect(updated.metadata.category).toBe('ai'); - expect(updated.vector).toBe(originalVector); - }); - - test('should re-embed when text changes', async () => { - const originalVector = db.get('doc-1')!.vector; - - const reembedded = await db.update('doc-1', { - text: 'Deep learning advanced concepts', - metadata: { id: 'doc-1', category: 'ai', author: 'Alice' }, - }); - - expect(reembedded).toBe(true); // Re-embedded - const updated = db.get('doc-1')!; - expect(updated.text).toBe('Deep learning advanced concepts'); - expect(updated.metadata.category).toBe('ai'); - expect(updated.vector).not.toBe(originalVector); // Different reference - }); - - test('should not re-embed when text is same', async () => { - const originalVector = db.get('doc-1')!.vector; - - const reembedded = await db.update('doc-1', { - text: 'Machine learning basics', // Same text - metadata: { id: 'doc-1', category: 'ai', author: 'Alice' }, - }); - - expect(reembedded).toBe(false); // Not re-embedded - const updated = db.get('doc-1')!; - expect(updated.vector).toBe(originalVector); - expect(updated.metadata.category).toBe('ai'); - }); - - test('should force re-embed when forceReembed is true', async () => { - const originalVector = db.get('doc-1')!.vector; - - const reembedded = await db.update( - 'doc-1', - { - text: 'Machine learning basics', // Same text - }, - { forceReembed: true }, - ); - - expect(reembedded).toBe(true); // Forced re-embedding - const updated = db.get('doc-1')!; - expect(updated.vector).not.toBe(originalVector); - }); - - test('should throw error for empty text', async () => { - await expect(db.update('doc-1', { text: '' })).rejects.toThrow('Document text cannot be empty'); - }); - - test('should throw error for whitespace-only text', async () => { - await expect(db.update('doc-1', { text: ' \n\t ' })).rejects.toThrow('Document text cannot be empty'); - }); - - test('should throw error for non-existent document', async () => { - await expect(db.update('non-existent', { text: 'New text' })).rejects.toThrow( - 'Document with id "non-existent" not found', - ); - }); - - test('should update text only without metadata', async () => { - const originalCategory = db.get('doc-1')!.metadata.category; - - await db.update('doc-1', { text: 'New text content' }); - - const updated = db.get('doc-1')!; - expect(updated.text).toBe('New text content'); - expect(updated.metadata.category).toBe(originalCategory); // Unchanged - }); - }); - - describe('updateMany', () => { - test('should update multiple documents efficiently', async () => { - 
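// Aside: the update() tests above pin down the observable re-embedding
// contract — regenerate the vector only when the text actually changed, or
// when the caller forces it. A sketch of that decision; the real
// implementation is not part of this diff.
function shouldReembedSketch(currentText: string, nextText: string | undefined, forceReembed = false): boolean {
  if (forceReembed) return true;
  return nextText !== undefined && nextText !== currentText;
}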
const result = await db.updateMany([ - { id: 'doc-1', metadata: { id: 'doc-1', category: 'ai', author: 'Alice' } }, - { id: 'doc-2', metadata: { id: 'doc-2', category: 'recipes', author: 'Bob' } }, - ]); - - expect(result.updated).toBe(2); - expect(result.reembedded).toBe(0); // Only metadata changed - - expect(db.get('doc-1')!.metadata.category).toBe('ai'); - expect(db.get('doc-2')!.metadata.category).toBe('recipes'); - }); - - test('should batch re-embed only documents with text changes', async () => { - const result = await db.updateMany([ - { id: 'doc-1', text: 'Updated machine learning', metadata: { id: 'doc-1', category: 'ai' } }, - { id: 'doc-2', metadata: { id: 'doc-2', category: 'recipes' } }, // No text change - { id: 'doc-3', text: 'Updated Python guide', metadata: { id: 'doc-3', category: 'programming' } }, - ]); - - expect(result.updated).toBe(3); - expect(result.reembedded).toBe(2); // Only doc-1 and doc-3 - - expect(db.get('doc-1')!.text).toBe('Updated machine learning'); - expect(db.get('doc-2')!.text).toBe('Cooking pasta recipes'); // Unchanged - expect(db.get('doc-3')!.text).toBe('Updated Python guide'); - }); - - test('should not re-embed when text is same', async () => { - const result = await db.updateMany([ - { id: 'doc-1', text: 'Machine learning basics', metadata: { id: 'doc-1', category: 'ai' } }, // Same text - ]); - - expect(result.updated).toBe(1); - expect(result.reembedded).toBe(0); - }); - - test('should force re-embed all when forceReembed is true', async () => { - const result = await db.updateMany( - [ - { id: 'doc-1', text: 'Machine learning basics', metadata: { id: 'doc-1', category: 'ai' } }, - { id: 'doc-2', text: 'Cooking pasta recipes', metadata: { id: 'doc-2', category: 'food' } }, - ], - { forceReembed: true }, - ); - - expect(result.updated).toBe(2); - expect(result.reembedded).toBe(2); // Both forced to re-embed - }); - - test('should throw error if any document not found', async () => { - await expect( - db.updateMany([ - { id: 'doc-1', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'non-existent', metadata: { id: 'non-existent', category: 'test' } }, - ]), - ).rejects.toThrow('Document with id "non-existent" not found'); - }); - - test('should throw error for empty text in batch', async () => { - await expect( - db.updateMany([ - { id: 'doc-1', text: 'Valid text', metadata: { id: 'doc-1', category: 'test' } }, - { id: 'doc-2', text: '', metadata: { id: 'doc-2', category: 'test' } }, - ]), - ).rejects.toThrow('Document with id "doc-2" has empty or whitespace-only text'); - }); - - test('should work with HNSW index', async () => { - const hnswDb = new VectoriaDB({ useHNSW: true }); - await hnswDb.initialize(); - - await hnswDb.addMany([ - { id: 'doc-1', text: 'Machine learning', metadata: { id: 'doc-1', category: 'tech' } }, - { id: 'doc-2', text: 'Cooking', metadata: { id: 'doc-2', category: 'food' } }, - ]); - - await hnswDb.updateMany([{ id: 'doc-1', text: 'Deep learning AI', metadata: { id: 'doc-1', category: 'ai' } }]); - - const results = await hnswDb.search('artificial intelligence', { threshold: 0 }); - expect(results.length).toBeGreaterThan(0); - }); - }); - - describe('update performance', () => { - test('updateMetadata should be instant (no embedding generation)', () => { - const start = Date.now(); - db.updateMetadata('doc-1', { id: 'doc-1', category: 'updated' }); - const duration = Date.now() - start; - - expect(duration).toBeLessThan(10); // Should be < 10ms - }); - - test('should update metadata on many documents quickly', 
async () => { - // Add many documents first - const docs: Array<{ id: string; text: string; metadata: TestMetadata }> = []; - for (let i = 0; i < 100; i++) { - docs.push({ id: `perf-${i}`, text: `Document ${i}`, metadata: { id: `perf-${i}`, category: 'test' } }); - } - await db.addMany(docs); - - // Metadata-only updates should be fast - for (let i = 0; i < 100; i++) { - db.updateMetadata(`perf-${i}`, { id: `perf-${i}`, category: 'updated' }); - } - - expect(db.get('perf-50')!.metadata.category).toBe('updated'); - }); - }); - }); -}); diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts deleted file mode 100644 index 0850ff9..0000000 --- a/libs/vectoriadb/src/embedding.service.ts +++ /dev/null @@ -1,197 +0,0 @@ -import { EmbeddingError, ConfigurationError } from './errors'; - -/** - * Service for generating embeddings using transformers.js - * - * NOTE: This service requires @huggingface/transformers to be installed. - * Install it with: npm install @huggingface/transformers - * - * For a zero-dependency alternative, use TFIDFEmbeddingService instead. - */ -export class EmbeddingService { - // Static transformers module for dependency injection (used in testing) - private static _transformersModule: any = null; - - /** - * Inject a transformers module (for testing purposes) - * @internal - */ - static setTransformersModule(module: any): void { - EmbeddingService._transformersModule = module; - } - - /** - * Clear the injected transformers module - * @internal - */ - static clearTransformersModule(): void { - EmbeddingService._transformersModule = null; - } - - // Using 'any' because @huggingface/transformers is an optional dependency - private pipeline: any = null; - private modelName: string; - private cacheDir: string; - private dimensions = 384; // default for all-MiniLM-L6-v2 - private isInitialized = false; - private initializationPromise: Promise<void> | null = null; - - constructor(modelName = 'Xenova/all-MiniLM-L6-v2', cacheDir = './.cache/transformers') { - this.modelName = modelName; - this.cacheDir = cacheDir; - } - - /** - * Dynamically import @huggingface/transformers - * This allows the package to be optional - only loaded when actually used - */ - private async loadTransformers(): Promise<any> { - // Use injected module if available (for testing) - if (EmbeddingService._transformersModule) { - return EmbeddingService._transformersModule.pipeline; - } - - try { - // Dynamic import - package may not be installed - // Using Function() to bypass TypeScript's static analysis for optional dependency - const transformers = await (Function('return import("@huggingface/transformers")')() as Promise<any>); - return transformers.pipeline; - } catch (_error) { - throw new ConfigurationError( - '@huggingface/transformers is not installed. 
' + - 'Install it with: npm install @huggingface/transformers\n' + - 'Or use TFIDFVectoria/TFIDFEmbeddingService for a zero-dependency alternative.', - ); - } - } - - /** - * Initialize the embedding model - */ - async initialize(): Promise { - // Prevent multiple initializations - if (this.isInitialized) { - return; - } - - if (this.initializationPromise) { - return this.initializationPromise; - } - - this.initializationPromise = this._initialize(); - return this.initializationPromise; - } - - private async _initialize(): Promise { - try { - // Dynamically load transformers - const pipelineFn = await this.loadTransformers(); - - // Create feature extraction pipeline - this.pipeline = await pipelineFn('feature-extraction', this.modelName, { - // Use local models directory to cache models - cache_dir: this.cacheDir, - // // Don't require progress bars in production - // progress_callback: null, - }); - - // Test the pipeline to get dimensions - const testEmbedding = await this.pipeline('test', { - pooling: 'mean', - normalize: true, - }); - - this.dimensions = testEmbedding.data.length; - this.isInitialized = true; - } catch (error) { - this.initializationPromise = null; - if (error instanceof ConfigurationError) { - throw error; - } - throw new EmbeddingError( - `Failed to initialize embedding model: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? error : undefined, - ); - } - } - - /** - * Generate embedding for a single text - */ - async generateEmbedding(text: string): Promise { - if (!this.isInitialized) { - await this.initialize(); - } - - try { - const output = await this.pipeline(text, { - pooling: 'mean', - normalize: true, - }); - - return new Float32Array(output.data); - } catch (error) { - throw new EmbeddingError( - `Failed to generate embedding: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? error : undefined, - ); - } - } - - /** - * Generate embeddings for multiple texts in batch - */ - async generateEmbeddings(texts: string[]): Promise { - if (!this.isInitialized) { - await this.initialize(); - } - - try { - // Process in batches to avoid memory issues - const batchSize = 32; - const results: Float32Array[] = []; - - for (let i = 0; i < texts.length; i += batchSize) { - const batch = texts.slice(i, i + batchSize); - const outputs = await Promise.all( - batch.map((text) => - this.pipeline(text, { - pooling: 'mean', - normalize: true, - }), - ), - ); - - results.push(...outputs.map((output) => new Float32Array(output.data))); - } - - return results; - } catch (error) { - throw new EmbeddingError( - `Failed to generate embeddings: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? error : undefined, - ); - } - } - - /** - * Get the vector dimensions - */ - getDimensions(): number { - return this.dimensions; - } - - /** - * Get the model name - */ - getModelName(): string { - return this.modelName; - } - - /** - * Check if the service is initialized - */ - isReady(): boolean { - return this.isInitialized; - } -} diff --git a/libs/vectoriadb/src/error.utils.ts b/libs/vectoriadb/src/error.utils.ts deleted file mode 100644 index 111e72a..0000000 --- a/libs/vectoriadb/src/error.utils.ts +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Error message sanitization utilities - * Prevents information disclosure in production environments - */ - -/** - * Sanitizes an error message for production use - * Removes potentially sensitive information like file paths, IDs, etc. 
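- *
- * @example A minimal sketch of the intended contrast (illustrative values;
- * the path shown is hypothetical):
- * ```ts
- * const err = new Error('ENOENT: no such file /var/data/embeddings.json');
- * sanitizeErrorMessage(err, 'Storage operation failed', true);  // => 'ENOENT: no such file /var/data/embeddings.json'
- * sanitizeErrorMessage(err, 'Storage operation failed', false); // => 'Storage operation failed'
- * ```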
- * - * @param error - The error object or message - * @param genericMessage - The generic message to use in production - * @param verboseErrors - Whether to return verbose error messages - * @returns Sanitized error message - */ -export function sanitizeErrorMessage(error: Error | string, genericMessage: string, verboseErrors = true): string { - if (verboseErrors) { - return typeof error === 'string' ? error : error.message; - } - - // In production mode, return generic message - return genericMessage; -} - -/** - * Sanitizes document ID from error messages - * Replaces actual IDs with generic placeholder in production - * - * @param id - The document ID - * @param verboseErrors - Whether to include the actual ID - * @returns Sanitized ID string - */ -export function sanitizeDocumentId(id: string, verboseErrors = true): string { - if (verboseErrors) { - return id; - } - return '[document]'; -} - -/** - * Creates a sanitized error message for file operations - * Removes file paths and other sensitive details in production - * - * @param operation - The operation being performed (e.g., 'read', 'write') - * @param error - The original error - * @param verboseErrors - Whether to include detailed error information - * @returns Sanitized error message - */ -export function sanitizeFileError(operation: string, error: Error | string, verboseErrors = true): string { - if (verboseErrors) { - const message = typeof error === 'string' ? error : error.message; - return `Failed to ${operation} file: ${message}`; - } - - return `Failed to ${operation} file`; -} - -/** - * Creates a sanitized error message for storage operations - * Removes connection strings and other sensitive details in production - * - * @param operation - The operation being performed - * @param error - The original error - * @param verboseErrors - Whether to include detailed error information - * @returns Sanitized error message - */ -export function sanitizeStorageError(operation: string, error: Error | string, verboseErrors = true): string { - if (verboseErrors) { - const message = typeof error === 'string' ? 
error : error.message; - return `Storage ${operation} failed: ${message}`; - } - - return `Storage operation failed`; -} - -/** - * Generic error messages for common operations - * Used in production mode to prevent information disclosure - */ -export const GENERIC_ERROR_MESSAGES = { - VALIDATION_ERROR: 'Validation failed', - DOCUMENT_NOT_FOUND: 'Document not found', - DOCUMENT_EXISTS: 'Document already exists', - DUPLICATE_DOCUMENT: 'Duplicate document detected', - QUERY_ERROR: 'Query validation failed', - EMBEDDING_ERROR: 'Embedding generation failed', - STORAGE_ERROR: 'Storage operation failed', - NOT_INITIALIZED: 'Database not initialized', - CONFIGURATION_ERROR: 'Invalid configuration', -} as const; diff --git a/libs/vectoriadb/src/errors.ts b/libs/vectoriadb/src/errors.ts deleted file mode 100644 index 4ed765f..0000000 --- a/libs/vectoriadb/src/errors.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * VectoriaDB Error Classes - * Production-ready error handling with specific error types - */ - -/** - * Base error class for all VectoriaDB errors - */ -export class VectoriaError extends Error { - constructor( - message: string, - public readonly code: string, - ) { - super(message); - this.name = this.constructor.name; - Error.captureStackTrace(this, this.constructor); - } -} - -/** - * Thrown when attempting operations before VectoriaDB is initialized - */ -export class VectoriaNotInitializedError extends VectoriaError { - constructor(operation: string) { - super(`VectoriaDB must be initialized before ${operation}. Call initialize() first.`, 'NOT_INITIALIZED'); - } -} - -/** - * Thrown when document validation fails - */ -export class DocumentValidationError extends VectoriaError { - constructor( - message: string, - public readonly documentId?: string, - ) { - super(message, 'DOCUMENT_VALIDATION_ERROR'); - } -} - -/** - * Thrown when a document with the given ID is not found - */ -export class DocumentNotFoundError extends VectoriaError { - constructor(public readonly documentId: string) { - super(`Document with id "${documentId}" not found`, 'DOCUMENT_NOT_FOUND'); - } -} - -/** - * Thrown when attempting to add a document that already exists - */ -export class DocumentExistsError extends VectoriaError { - constructor(public readonly documentId: string) { - super( - `Document with id "${documentId}" already exists. Use remove() first or choose a different id.`, - 'DOCUMENT_EXISTS', - ); - } -} - -/** - * Thrown when a duplicate document ID is found in a batch operation - */ -export class DuplicateDocumentError extends VectoriaError { - constructor( - public readonly documentId: string, - public readonly context: 'batch' | 'existing', - ) { - const message = - context === 'batch' - ? 
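-      // Illustrative consumer-side handling (a sketch, assuming only the
-      // classes and fields defined in this file; `db` is hypothetical):
-      //   try { await db.addMany(docs); }
-      //   catch (e) {
-      //     if (e instanceof DuplicateDocumentError) console.warn(e.code, e.documentId);
-      //   }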
`Duplicate document id "${documentId}" in batch` - : `Document with id "${documentId}" already exists`; - super(message, 'DUPLICATE_DOCUMENT'); - } -} - -/** - * Thrown when search query validation fails - */ -export class QueryValidationError extends VectoriaError { - constructor(message: string) { - super(message, 'QUERY_VALIDATION_ERROR'); - } -} - -/** - * Thrown when embedding generation fails or produces unexpected results - */ -export class EmbeddingError extends VectoriaError { - constructor( - message: string, - public readonly details?: any, - ) { - super(message, 'EMBEDDING_ERROR'); - } -} - -/** - * Thrown when storage operations fail - */ -export class StorageError extends VectoriaError { - constructor( - message: string, - public readonly originalError?: Error, - ) { - super(message, 'STORAGE_ERROR'); - } -} - -/** - * Thrown when configuration is invalid - */ -export class ConfigurationError extends VectoriaError { - constructor(message: string) { - super(message, 'CONFIGURATION_ERROR'); - } -} diff --git a/libs/vectoriadb/src/hnsw.index.ts b/libs/vectoriadb/src/hnsw.index.ts deleted file mode 100644 index a1b4599..0000000 --- a/libs/vectoriadb/src/hnsw.index.ts +++ /dev/null @@ -1,401 +0,0 @@ -import { cosineSimilarity } from './similarity.utils'; - -/** - * HNSW (Hierarchical Navigable Small World) Index - * Provides efficient approximate nearest neighbor search - */ - -/** - * Configuration for HNSW index - */ -export interface HNSWConfig { - /** - * Maximum number of connections per node in layer > 0 - * @default 16 - */ - M?: number; - - /** - * Maximum connections for layer 0 (typically M * 2) - * @default 32 - */ - M0?: number; - - /** - * Size of dynamic candidate list during construction - * Higher = better quality, slower construction - * @default 200 - */ - efConstruction?: number; - - /** - * Size of dynamic candidate list during search - * Higher = better recall, slower search - * @default 50 - */ - efSearch?: number; - - /** - * Normalization factor for level assignment - * @default 1 / Math.log(2) - */ - levelMultiplier?: number; -} - -/** - * Node in the HNSW graph - */ -interface HNSWNode { - id: string; - vector: Float32Array; - level: number; - connections: Map>; // layer -> set of neighbor IDs -} - -/** - * Candidate for nearest neighbor search - */ -interface Candidate { - id: string; - distance: number; -} - -/** - * HNSW Index implementation - */ -export class HNSWIndex { - private nodes: Map; - private entryPointId: string | null; - private config: Required; - private maxLevel: number; - - constructor(config: HNSWConfig = {}) { - this.nodes = new Map(); - this.entryPointId = null; - this.maxLevel = -1; - - this.config = { - M: config.M ?? 16, - M0: config.M0 ?? 32, - efConstruction: config.efConstruction ?? 200, - efSearch: config.efSearch ?? 50, - levelMultiplier: config.levelMultiplier ?? 
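-      // Note on the default below (the standard HNSW level law, stated here
-      // for clarity): assignLevel() draws level = floor(-ln(U) * levelMultiplier)
-      // for U ~ Uniform(0, 1), so P(level >= l) = exp(-l / levelMultiplier).
-      // With the default 1 / ln(2) this gives P(level >= l) = 2^-l: about half
-      // of all nodes stay on layer 0 only, a quarter reach layer 1, and so on.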
1 / Math.log(2), - }; - } - - /** - * Insert a vector into the HNSW index - */ - insert(id: string, vector: Float32Array): void { - // Assign random level to new node (exponential decay) - const level = this.assignLevel(); - - // Create new node - const newNode: HNSWNode = { - id, - vector, - level, - connections: new Map(), - }; - - // Initialize connection sets for each layer - for (let lc = 0; lc <= level; lc++) { - newNode.connections.set(lc, new Set()); - } - - this.nodes.set(id, newNode); - - // If this is the first node, make it the entry point - if (this.entryPointId === null) { - this.entryPointId = id; - this.maxLevel = level; - return; - } - - // Find nearest neighbors at each level and connect - let currentNearest = this.entryPointId; - - // Search from top level down to level + 1 - for (let lc = this.maxLevel; lc > level; lc--) { - const nearest = this.searchLayer(vector, currentNearest, 1, lc); - if (nearest.length > 0) { - currentNearest = nearest[0].id; - } - } - - // Search and insert from level down to 0 - for (let lc = level; lc >= 0; lc--) { - const candidates = this.searchLayer(vector, currentNearest, this.config.efConstruction, lc); - - // Select M neighbors - const M = lc === 0 ? this.config.M0 : this.config.M; - const neighbors = this.selectNeighbors(candidates, M); - - // Add bidirectional links - for (const neighbor of neighbors) { - const neighborNode = this.nodes.get(neighbor.id); - if (!neighborNode) { - continue; - } - - // Only add connections if both nodes exist at this layer - this.addConnection(id, neighbor.id, lc); - - // Only add reverse connection if neighbor exists at this layer - if (neighborNode.level >= lc) { - this.addConnection(neighbor.id, id, lc); - - // Prune connections if needed - const neighborConnections = neighborNode.connections.get(lc); - if (neighborConnections && neighborConnections.size > M) { - this.pruneConnections(neighbor.id, lc, M); - } - } - } - - if (candidates.length > 0) { - currentNearest = candidates[0].id; - } - } - - // Update entry point if new node has higher level - if (level > this.maxLevel) { - this.maxLevel = level; - this.entryPointId = id; - } - } - - /** - * Search for k nearest neighbors - */ - search(query: Float32Array, k: number, ef?: number): Candidate[] { - if (this.entryPointId === null) { - return []; - } - - const efSearch = ef ?? 
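-    // (A note on tuning: searchLayer() below keeps at most `ef` candidates at
-    // layer 0, so recall and latency both grow with it, and values below k
-    // cap how many results can be returned at all.)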
this.config.efSearch; - let currentNearest = this.entryPointId; - - // Search from top level down to level 1 - for (let lc = this.maxLevel; lc > 0; lc--) { - const nearest = this.searchLayer(query, currentNearest, 1, lc); - if (nearest.length > 0) { - currentNearest = nearest[0].id; - } - } - - // Search at layer 0 with efSearch - const candidates = this.searchLayer(query, currentNearest, efSearch, 0); - - // Return top k - return candidates.slice(0, k); - } - - /** - * Remove a node from the index - */ - remove(id: string): boolean { - const node = this.nodes.get(id); - if (!node) { - return false; - } - - // Remove all connections to this node - for (let lc = 0; lc <= node.level; lc++) { - const connections = node.connections.get(lc)!; - for (const neighborId of connections) { - const neighbor = this.nodes.get(neighborId); - if (neighbor) { - neighbor.connections.get(lc)?.delete(id); - } - } - } - - // Remove the node - this.nodes.delete(id); - - // Update entry point if needed - if (this.entryPointId === id) { - // Find new entry point (node with highest level) - let newEntryPoint: string | null = null; - let newMaxLevel = -1; - - for (const [nodeId, node] of this.nodes) { - if (node.level > newMaxLevel) { - newMaxLevel = node.level; - newEntryPoint = nodeId; - } - } - - this.entryPointId = newEntryPoint; - this.maxLevel = newMaxLevel; - } - - return true; - } - - /** - * Get the number of nodes in the index - */ - size(): number { - return this.nodes.size; - } - - /** - * Clear all nodes from the index - */ - clear(): void { - this.nodes.clear(); - this.entryPointId = null; - this.maxLevel = -1; - } - - /** - * Search within a single layer - */ - private searchLayer(query: Float32Array, entryPoint: string, ef: number, layer: number): Candidate[] { - const visited = new Set(); - const candidates: Candidate[] = []; - const results: Candidate[] = []; - - const entryNode = this.nodes.get(entryPoint); - if (!entryNode) { - return []; - } - - const entryDistance = this.distance(query, entryNode.vector); - candidates.push({ id: entryPoint, distance: entryDistance }); - results.push({ id: entryPoint, distance: entryDistance }); - visited.add(entryPoint); - - while (candidates.length > 0) { - // Get closest candidate - candidates.sort((a, b) => a.distance - b.distance); - const current = candidates.shift()!; - - // If current is farther than furthest result, stop - if (results.length >= ef) { - results.sort((a, b) => a.distance - b.distance); - if (current.distance > results[ef - 1].distance) { - break; - } - } - - // Explore neighbors - const currentNode = this.nodes.get(current.id)!; - const connections = currentNode.connections.get(layer); - - if (connections) { - for (const neighborId of connections) { - if (visited.has(neighborId)) { - continue; - } - visited.add(neighborId); - - const neighborNode = this.nodes.get(neighborId); - if (!neighborNode) { - continue; // Node was deleted - } - - const neighborDistance = this.distance(query, neighborNode.vector); - - // Add to results if better than worst result or results not full - if (results.length < ef || neighborDistance < results[results.length - 1].distance) { - candidates.push({ id: neighborId, distance: neighborDistance }); - results.push({ id: neighborId, distance: neighborDistance }); - - // Keep results sorted and limited to ef - results.sort((a, b) => a.distance - b.distance); - if (results.length > ef) { - results.pop(); - } - } - } - } - } - - results.sort((a, b) => a.distance - b.distance); - return results; - } - - /** - * 
Select neighbors using heuristic - */ - private selectNeighbors(candidates: Candidate[], M: number): Candidate[] { - // Sort by distance - candidates.sort((a, b) => a.distance - b.distance); - - // Return top M - return candidates.slice(0, M); - } - - /** - * Prune connections to maintain M limit - */ - private pruneConnections(nodeId: string, layer: number, M: number): void { - const node = this.nodes.get(nodeId); - if (!node) { - return; - } - - const connections = node.connections.get(layer)!; - if (connections.size <= M) { - return; - } - - // Calculate distances to all neighbors - const neighbors: Candidate[] = []; - for (const neighborId of connections) { - const neighbor = this.nodes.get(neighborId)!; - const distance = this.distance(node.vector, neighbor.vector); - neighbors.push({ id: neighborId, distance }); - } - - // Keep closest M neighbors - neighbors.sort((a, b) => a.distance - b.distance); - const toKeep = new Set(neighbors.slice(0, M).map((n) => n.id)); - - // Remove connections not in toKeep - for (const neighborId of connections) { - if (!toKeep.has(neighborId)) { - connections.delete(neighborId); - // Remove reverse connection - const neighbor = this.nodes.get(neighborId); - if (neighbor) { - neighbor.connections.get(layer)?.delete(nodeId); - } - } - } - } - - /** - * Add bidirectional connection between two nodes - */ - private addConnection(fromId: string, toId: string, layer: number): void { - const fromNode = this.nodes.get(fromId); - if (!fromNode) { - return; - } - - const connections = fromNode.connections.get(layer); - if (connections) { - connections.add(toId); - } - } - - /** - * Assign random level using exponential decay - */ - private assignLevel(): number { - const randomValue = Math.random(); - return Math.floor(-Math.log(randomValue) * this.config.levelMultiplier); - } - - /** - * Calculate distance between two vectors - * Uses 1 - cosine similarity to convert similarity to distance - */ - private distance(a: Float32Array, b: Float32Array): number { - return 1 - cosineSimilarity(a, b); - } -} diff --git a/libs/vectoriadb/src/index.ts b/libs/vectoriadb/src/index.ts deleted file mode 100644 index c9d6184..0000000 --- a/libs/vectoriadb/src/index.ts +++ /dev/null @@ -1,56 +0,0 @@ -/** - * VectoriaDB - A lightweight, production-ready in-memory vector database - * - * @packageDocumentation - */ - -export { VectoriaDB } from './vectoria'; -export { EmbeddingService } from './embedding.service'; -export { HNSWIndex } from './hnsw.index'; -export type { HNSWConfig } from './hnsw.index'; -export * from './similarity.utils'; -export * from './regex.utils'; -export * from './error.utils'; -export * from './interfaces'; - -// TF-IDF based lightweight vector database (zero external dependencies) -export { TFIDFVectoria } from './vectoria-tfidf'; -export type { TFIDFDocument, TFIDFVectoriaConfig } from './vectoria-tfidf'; -export { TFIDFEmbeddingService } from './tfidf.embedding.service'; - -// Storage adapters -export { BaseStorageAdapter } from './storage/base.adapter'; -export { MemoryStorageAdapter } from './storage/memory.adapter'; -export { FileStorageAdapter } from './storage/file.adapter'; -export type { FileStorageConfig } from './storage/file.adapter'; -export { RedisStorageAdapter } from './storage/redis.adapter'; -export type { RedisStorageConfig, RedisClient } from './storage/redis.adapter'; -export type { - StorageAdapterConfig, - StorageMetadata, - StoredData, - SerializedEmbedding, -} from './storage/adapter.interface'; - -// Serialization utilities 
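-// (Round-trip sketch, illustrative only, using the helpers re-exported below:
-//   const wire = serializeEmbedding(embedding);  // Float32Array -> number[], Date -> ISO string
-//   const back = deserializeEmbedding(wire);     // restores Float32Array/Date, sanitizes metadata
-//  The serialized form is JSON-safe, which is what the storage adapters rely on.)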
-export { - serializeEmbedding, - deserializeEmbedding, - hash, - createToolsHash, - sanitizeObject, -} from './storage/serialization.utils'; - -// Error classes -export { - VectoriaError, - VectoriaNotInitializedError, - DocumentValidationError, - DocumentNotFoundError, - DocumentExistsError, - DuplicateDocumentError, - QueryValidationError, - EmbeddingError, - StorageError, - ConfigurationError, -} from './errors'; diff --git a/libs/vectoriadb/src/interfaces.ts b/libs/vectoriadb/src/interfaces.ts deleted file mode 100644 index 31b436f..0000000 --- a/libs/vectoriadb/src/interfaces.ts +++ /dev/null @@ -1,271 +0,0 @@ -/** - * Configuration options for VectoriaDB - */ -export interface VectoriaConfig { - /** - * Name of the embedding model to use - * @default 'Xenova/all-MiniLM-L6-v2' - */ - modelName?: string; - - /** - * Directory to cache downloaded models - * @default './.cache/transformers' - */ - cacheDir?: string; - - /** - * Vector dimensions (auto-detected from model if not provided) - */ - dimensions?: number; - - /** - * Default similarity threshold for search results - * @default 0.3 - */ - defaultSimilarityThreshold?: number; - - /** - * Maximum number of results to return by default - * @default 10 - */ - defaultTopK?: number; - - /** - * Enable HNSW index for faster search - * When enabled, provides O(log n) search instead of O(n) brute-force - * @default false - */ - useHNSW?: boolean; - - /** - * HNSW index configuration - */ - hnsw?: { - /** - * Maximum number of connections per node in layer > 0 - * Higher = better recall, more memory - * @default 16 - */ - M?: number; - - /** - * Maximum connections for layer 0 (typically M * 2) - * @default 32 - */ - M0?: number; - - /** - * Size of dynamic candidate list during construction - * Higher = better quality index, slower construction - * @default 200 - */ - efConstruction?: number; - - /** - * Size of dynamic candidate list during search - * Higher = better recall, slower search - * @default 50 - */ - efSearch?: number; - }; - - /** - * Storage adapter for persisting embeddings - * @default MemoryStorageAdapter (no persistence) - */ - storageAdapter?: any; // Will be typed as StorageAdapter in implementation - - /** - * Tools hash for cache invalidation - * Used to detect when tools/documents change - * If not provided, cache won't be invalidated based on content - */ - toolsHash?: string; - - /** - * Version string for cache compatibility - * Used to invalidate cache when version changes - * @default package version - */ - version?: string; - - /** - * Maximum number of documents allowed in the database - * Prevents memory exhaustion attacks - * @default 100000 - */ - maxDocuments?: number; - - /** - * Maximum size of a single document text (in characters) - * Prevents memory exhaustion via huge documents - * @default 1000000 (1 million characters ~1MB) - */ - maxDocumentSize?: number; - - /** - * Maximum number of documents in a single batch operation - * Prevents DoS via massive batch operations - * @default 1000 - */ - maxBatchSize?: number; - - /** - * Enable verbose error messages - * When false (production mode), error messages are sanitized to prevent information disclosure - * When true (development mode), error messages include detailed information for debugging - * @default true (development mode with verbose errors) - */ - verboseErrors?: boolean; -} - -/** - * Metadata associated with a document embedding - * Flexible structure to support any domain - */ -export interface DocumentMetadata { - /** - * Unique 
identifier for the document
-   */
-  id: string;
-
-  /**
-   * Additional metadata fields (flexible)
-   */
-  [key: string]: any;
-}
-
-/**
- * Stored document embedding with vector and metadata
- */
-export interface DocumentEmbedding<T extends DocumentMetadata = DocumentMetadata> {
-  /**
-   * Unique identifier for this embedding
-   */
-  id: string;
-
-  /**
-   * Vector representation
-   */
-  vector: Float32Array;
-
-  /**
-   * Associated metadata
-   */
-  metadata: T;
-
-  /**
-   * Original text used to generate the embedding
-   */
-  text: string;
-
-  /**
-   * Timestamp when this embedding was created
-   */
-  createdAt: Date;
-}
-
-/**
- * Search filter function
- */
-export type FilterFunction<T extends DocumentMetadata = DocumentMetadata> = (metadata: T) => boolean;
-
-/**
- * Search options
- */
-export interface SearchOptions<T extends DocumentMetadata = DocumentMetadata> {
-  /**
-   * Maximum number of results to return
-   * @default 10
-   */
-  topK?: number;
-
-  /**
-   * Minimum similarity score threshold (0-1)
-   * @default 0.3
-   */
-  threshold?: number;
-
-  /**
-   * Filter function to apply to metadata
-   * Returns true if the document should be included
-   */
-  filter?: FilterFunction<T>;
-
-  /**
-   * Whether to include the vector in results
-   * @default false
-   */
-  includeVector?: boolean;
-}
-
-/**
- * Search result with similarity score
- */
-export interface SearchResult<T extends DocumentMetadata = DocumentMetadata> {
-  /**
-   * Document ID
-   */
-  id: string;
-
-  /**
-   * Document metadata
-   */
-  metadata: T;
-
-  /**
-   * Cosine similarity score (0-1, higher is better)
-   */
-  score: number;
-
-  /**
-   * Original text used for embedding
-   */
-  text: string;
-
-  /**
-   * Vector (only if includeVector: true)
-   */
-  vector?: Float32Array;
-}
-
-/**
- * Statistics about the vector database
- */
-export interface VectoriaStats {
-  /**
-   * Total number of embeddings
-   */
-  totalEmbeddings: number;
-
-  /**
-   * Vector dimensions
-   */
-  dimensions: number;
-
-  /**
-   * Memory usage estimate in bytes
-   */
-  estimatedMemoryBytes: number;
-
-  /**
-   * Embedding model name
-   */
-  modelName: string;
-}
-
-/**
- * Document data for embedding generation
- */
-export interface DocumentData {
-  /**
-   * Main text content
-   */
-  text: string;
-
-  /**
-   * Additional metadata (optional)
-   */
-  metadata?: Record<string, any>;
-}
diff --git a/libs/vectoriadb/src/regex.utils.ts b/libs/vectoriadb/src/regex.utils.ts
deleted file mode 100644
index d50d668..0000000
--- a/libs/vectoriadb/src/regex.utils.ts
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Regular expression utilities with ReDoS (Regular Expression Denial of Service) protection
- *
- * ReDoS attacks exploit poorly designed regex patterns that can cause catastrophic backtracking,
- * leading to exponential time complexity and potential service outages.
- *
- * Guidelines for safe regex patterns:
- * 1. Avoid nested quantifiers: /(a+)+$/ is vulnerable
- * 2. Avoid alternation with overlapping patterns: /(a|a)*$/ is vulnerable
- * 3. Avoid patterns that can match the same input in multiple ways
- * 4. Use atomic groups or possessive quantifiers when available
- * 5. Always limit input length before applying complex patterns
- * 6. Test patterns with long, repetitive inputs
- */
-
-/**
- * Maximum pattern length to analyze for ReDoS detection.
- * Limiting input length prevents the detection patterns themselves from being exploited.
- */
-const MAX_PATTERN_LENGTH_FOR_REDOS_CHECK = 500;
-
-/**
- * Pre-compiled patterns for ReDoS detection.
- * Uses bounded quantifiers {0,100} instead of unbounded * to prevent
- * catastrophic backtracking in the detection patterns themselves.
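- *
- * @example Illustrative results for isPotentiallyVulnerableRegex (defined below):
- * ```ts
- * isPotentiallyVulnerableRegex('(a+)+$');   // true  -- nested quantifiers
- * isPotentiallyVulnerableRegex('^[a-z]+$'); // false -- no nested or overlapping groups
- * ```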
- */ -const REDOS_DETECTION_PATTERNS = { - // Check for nested quantifiers: (a+)+ or (a*)* - nestedQuantifiers: /\([^)]{0,100}[*+{][^)]{0,100}\)[*+{]/, - // Check for alternation with overlapping patterns: (a|ab)* - alternationOverlap: /\([^|]{0,100}\|[^)]{0,100}\)[*+{]/, - // Check for repeated groups with quantifiers: (a+)+ - repeatedGroups: /\([^)]{0,100}[*+][^)]{0,100}\)[*+]/, -} as const; - -/** - * Safely test a pattern against input with length limiting. - * Prevents ReDoS in the detection patterns themselves. - */ -function safePatternTest(input: string, testPattern: RegExp): boolean { - const safeInput = - input.length > MAX_PATTERN_LENGTH_FOR_REDOS_CHECK ? input.slice(0, MAX_PATTERN_LENGTH_FOR_REDOS_CHECK) : input; - try { - return testPattern.test(safeInput); - } catch { - return false; - } -} - -/** - * Detects potentially vulnerable regex patterns - * Checks for common ReDoS patterns like nested quantifiers - * - * @param pattern - The regex pattern to check - * @returns true if the pattern is potentially vulnerable - */ -export function isPotentiallyVulnerableRegex(pattern: string): boolean { - // Check for nested quantifiers: (a+)+ or (a*)* - if (safePatternTest(pattern, REDOS_DETECTION_PATTERNS.nestedQuantifiers)) { - return true; - } - - // Check for alternation with overlapping patterns: (a|ab)* - if (safePatternTest(pattern, REDOS_DETECTION_PATTERNS.alternationOverlap)) { - return true; - } - - // Check for repeated groups with quantifiers: (a+)+ - if (safePatternTest(pattern, REDOS_DETECTION_PATTERNS.repeatedGroups)) { - return true; - } - - return false; -} - -/** - * Creates a safe regex with timeout protection - * Wraps regex execution with a timeout to prevent ReDoS attacks - * - * @param pattern - The regex pattern - * @param flags - Optional regex flags - * @param timeoutMs - Maximum execution time in milliseconds (default: 100ms) - * @returns A function that safely executes the regex - */ -export function createSafeRegex( - pattern: string | RegExp, - flags?: string, - _timeoutMs = 100, -): (input: string) => RegExpMatchArray | null { - const regex = typeof pattern === 'string' ? new RegExp(pattern, flags) : pattern; - - return (input: string): RegExpMatchArray | null => { - // Limit input length as first line of defense - const maxInputLength = 10000; - if (input.length > maxInputLength) { - input = input.substring(0, maxInputLength); - } - - // Note: JavaScript doesn't have built-in regex timeout - // For production use, consider using a worker thread or - // a library like 'safe-regex' for more robust protection - try { - return input.match(regex); - } catch { - // Regex execution failed - return null; - } - }; -} - -/** - * Validates that a string matches a pattern safely - * Limits input length and provides basic ReDoS protection - * - * @param input - The string to validate - * @param pattern - The regex pattern - * @param maxLength - Maximum input length to process (default: 10000) - * @returns true if the input matches the pattern - */ -export function safeTest(input: string, pattern: RegExp, maxLength = 10000): boolean { - if (input.length > maxLength) { - return false; - } - - try { - return pattern.test(input); - } catch { - return false; - } -} - -/** - * Safe regex patterns commonly used in the codebase - * These patterns have been reviewed for ReDoS vulnerabilities - */ -export const SAFE_PATTERNS = { - /** Matches control characters (newlines, tabs, null bytes, etc.) 
*/ - // eslint-disable-next-line no-control-regex - CONTROL_CHARS: /[\r\n\t\0\u000B\u000C]/g, - - /** Matches path separators (forward and backslash) */ - PATH_SEPARATORS: /[/\\]/g, - - /** Matches directory traversal sequences */ - DIR_TRAVERSAL: /\.\./g, - - /** Matches alphanumeric, underscore, and hyphen */ - ALPHANUMERIC_SAFE: /[^a-zA-Z0-9-_]/g, - - /** Matches word characters, colon, dot, and dash */ - REDIS_KEY_SAFE: /[^\w:.-]/g, - - /** Matches leading dots and dashes */ - LEADING_DOTS_DASHES: /^[.-]+/, - - /** Trailing dots and dashes */ - TRAILING_DOTS_DASHES: /[.-]+$/, -} as const; diff --git a/libs/vectoriadb/src/similarity.utils.ts b/libs/vectoriadb/src/similarity.utils.ts deleted file mode 100644 index d8851f6..0000000 --- a/libs/vectoriadb/src/similarity.utils.ts +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Vector similarity utility functions for semantic search - * - * Note: These utilities are intentionally self-contained within vectoriadb - * to maintain the library's standalone, portable nature as a publishable npm package. - */ - -import { EmbeddingError } from './errors'; - -/** - * Calculate cosine similarity between two vectors - * @param a First vector - * @param b Second vector - * @returns Cosine similarity score between -1 and 1 (-1 = opposite, 1 = identical) - */ -export function cosineSimilarity(a: Float32Array, b: Float32Array): number { - if (a.length !== b.length) { - throw new EmbeddingError(`Vector dimensions don't match: ${a.length} vs ${b.length}`); - } - - let dotProduct = 0; - let normA = 0; - let normB = 0; - - for (let i = 0; i < a.length; i++) { - dotProduct += a[i] * b[i]; - normA += a[i] * a[i]; - normB += b[i] * b[i]; - } - - normA = Math.sqrt(normA); - normB = Math.sqrt(normB); - - if (normA === 0 || normB === 0) { - return 0; - } - - return dotProduct / (normA * normB); -} - -/** - * Normalize a vector to unit length - * @param vector Vector to normalize - * @returns Normalized vector (or original vector unchanged if it's a zero vector) - */ -export function normalizeVector(vector: Float32Array): Float32Array { - let norm = 0; - for (let i = 0; i < vector.length; i++) { - norm += vector[i] * vector[i]; - } - norm = Math.sqrt(norm); - - if (norm === 0) { - return vector; - } - - const normalized = new Float32Array(vector.length); - for (let i = 0; i < vector.length; i++) { - normalized[i] = vector[i] / norm; - } - - return normalized; -} - -/** - * Calculate L2 (Euclidean) distance between two vectors - * @param a First vector - * @param b Second vector - * @returns L2 distance - */ -export function euclideanDistance(a: Float32Array, b: Float32Array): number { - if (a.length !== b.length) { - throw new EmbeddingError(`Vector dimensions don't match: ${a.length} vs ${b.length}`); - } - - let sum = 0; - for (let i = 0; i < a.length; i++) { - const diff = a[i] - b[i]; - sum += diff * diff; - } - - return Math.sqrt(sum); -} - -/** - * Calculate dot product of two vectors - * Useful when vectors are already normalized - * @param a First vector - * @param b Second vector - * @returns Dot product - */ -export function dotProduct(a: Float32Array, b: Float32Array): number { - if (a.length !== b.length) { - throw new EmbeddingError(`Vector dimensions don't match: ${a.length} vs ${b.length}`); - } - - let result = 0; - for (let i = 0; i < a.length; i++) { - result += a[i] * b[i]; - } - - return result; -} diff --git a/libs/vectoriadb/src/storage/adapter.interface.ts b/libs/vectoriadb/src/storage/adapter.interface.ts deleted file mode 100644 index 
597fbf9..0000000
--- a/libs/vectoriadb/src/storage/adapter.interface.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import type { DocumentMetadata } from '../interfaces';
-import { BaseStorageAdapter } from './base.adapter';
-
-/**
- * Metadata for the stored embeddings
- * Used for versioning and invalidation
- */
-export interface StorageMetadata {
-  /**
-   * Version of VectoriaDB (for compatibility)
-   */
-  version: string;
-
-  /**
-   * Hash of the tools/documents schema
-   * Used to invalidate cache when tools change
-   */
-  toolsHash: string;
-
-  /**
-   * Timestamp when the data was stored
-   */
-  timestamp: number;
-
-  /**
-   * Model name used for embeddings
-   */
-  modelName: string;
-
-  /**
-   * Vector dimensions
-   */
-  dimensions: number;
-
-  /**
-   * Number of documents stored
-   */
-  documentCount: number;
-}
-
-/**
- * Stored data with metadata and embeddings
- */
-export interface StoredData<T extends DocumentMetadata = DocumentMetadata> {
-  metadata: StorageMetadata;
-  embeddings: SerializedEmbedding<T>[];
-}
-
-/**
- * Serialized embedding (Float32Array cannot be directly JSON serialized)
- */
-export interface SerializedEmbedding<T extends DocumentMetadata = DocumentMetadata> {
-  id: string;
-  vector: number[]; // Float32Array serialized as number[]
-  metadata: T;
-  text: string;
-  createdAt: string; // Date serialized as ISO string
-}
-
-/**
- * Configuration for storage adapters
- */
-export interface StorageAdapterConfig {
-  /**
-   * Namespace/prefix for storage keys
-   * Useful for multi-tenant scenarios
-   */
-  namespace?: string;
-
-  /**
-   * Whether to automatically save on changes
-   * @default false
-   */
-  autoSave?: boolean;
-
-  /**
-   * Interval for auto-save in milliseconds
-   * Only used if autoSave is true
-   * @default 60000 (1 minute)
-   */
-  autoSaveInterval?: number;
-}
-
-// Export base class for adapters to extend
-export { BaseStorageAdapter };
diff --git a/libs/vectoriadb/src/storage/base.adapter.ts b/libs/vectoriadb/src/storage/base.adapter.ts
deleted file mode 100644
index f478dc2..0000000
--- a/libs/vectoriadb/src/storage/base.adapter.ts
+++ /dev/null
@@ -1,146 +0,0 @@
-import type { DocumentEmbedding, DocumentMetadata } from '../interfaces';
-import type { StorageAdapterConfig, StoredData, StorageMetadata, SerializedEmbedding } from './adapter.interface';
-import * as SerializationUtils from './serialization.utils';
-
-/**
- * Abstract base class for storage adapters
- * Provides common functionality and utilities to reduce code duplication
- */
-export abstract class BaseStorageAdapter<T extends DocumentMetadata = DocumentMetadata> {
-  protected config: Required<StorageAdapterConfig>;
-
-  constructor(config: StorageAdapterConfig = {}) {
-    this.config = {
-      namespace: config.namespace ?? 'default',
-      autoSave: config.autoSave ?? false,
-      autoSaveInterval: config.autoSaveInterval ?? 60000,
-    };
-  }
-
-  /**
-   * Initialize the storage adapter
-   */
-  abstract initialize(): Promise<void>;
-
-  /**
-   * Load embeddings from storage
-   */
-  abstract load(): Promise<StoredData<T> | null>;
-
-  /**
-   * Save embeddings to storage
-   */
-  abstract save(data: StoredData<T>): Promise<void>;
-
-  /**
-   * Clear all stored data
-   */
-  abstract clear(): Promise<void>;
-
-  /**
-   * Close/cleanup the adapter
-   */
-  abstract close(): Promise<void>;
-
-  /**
-   * Check if cached data exists and is valid
-   * Common implementation that works for most adapters
-   */
-  async hasValidCache(metadata: StorageMetadata): Promise<boolean> {
-    try {
-      const data = await this.load();
-      if (!data) {
-        return false;
-      }
-
-      return this.isMetadataValid(data.metadata, metadata);
-    } catch {
-      return false;
-    }
-  }
-
-  /**
-   * Validate if cached metadata matches current metadata
-   * Checks version, toolsHash, and modelName
-   */
-  protected isMetadataValid(cachedMetadata: StorageMetadata, currentMetadata: StorageMetadata): boolean {
-    // Check if version matches
-    if (cachedMetadata.version !== currentMetadata.version) {
-      return false;
-    }
-
-    // Check if tools hash matches (invalidate if tools changed)
-    if (cachedMetadata.toolsHash !== currentMetadata.toolsHash) {
-      return false;
-    }
-
-    // Check if model name matches
-    if (cachedMetadata.modelName !== currentMetadata.modelName) {
-      return false;
-    }
-
-    return true;
-  }
-
-  /**
-   * Serialize a DocumentEmbedding to a SerializedEmbedding
-   */
-  protected serializeEmbedding(embedding: DocumentEmbedding<T>): SerializedEmbedding<T> {
-    return SerializationUtils.serializeEmbedding(embedding);
-  }
-
-  /**
-   * Deserialize a SerializedEmbedding to a DocumentEmbedding
-   * Sanitizes metadata to prevent prototype pollution
-   */
-  protected deserializeEmbedding(serialized: SerializedEmbedding<T>): DocumentEmbedding<T> {
-    return SerializationUtils.deserializeEmbedding(serialized);
-  }
-
-  /**
-   * Create a cryptographic hash from a string using SHA-256
-   * More secure than simple hash - prevents collision attacks
-   */
-  protected hash(input: string): string {
-    return SerializationUtils.hash(input);
-  }
-
-  /**
-   * Create a hash from document IDs and texts
-   * Used to detect when tools/documents change
-   */
-  protected createToolsHash(documents: Array<{ id: string; text: string }>): string {
-    return SerializationUtils.createToolsHash(documents);
-  }
-
-  /**
-   * Safely parse JSON with error handling and prototype pollution protection
-   */
-  protected safeJsonParse<R>(content: string): R | null {
-    try {
-      const parsed = JSON.parse(content, (key, value) => {
-        // Block prototype pollution
-        if (key === '__proto__' || key === 'constructor' || key === 'prototype') {
-          return undefined;
-        }
-        return value;
-      });
-
-      // Additional sanitization for nested objects
-      return SerializationUtils.sanitizeObject(parsed) as R;
-    } catch {
-      return null;
-    }
-  }
-
-  /**
-   * Safely stringify JSON with error handling
-   */
-  protected safeJsonStringify(data: unknown, pretty = false): string | null {
-    try {
-      return JSON.stringify(data, null, pretty ?
2 : undefined); - } catch { - return null; - } - } -} diff --git a/libs/vectoriadb/src/storage/file.adapter.ts b/libs/vectoriadb/src/storage/file.adapter.ts deleted file mode 100644 index 5e759f5..0000000 --- a/libs/vectoriadb/src/storage/file.adapter.ts +++ /dev/null @@ -1,141 +0,0 @@ -import * as fs from 'fs/promises'; -import * as path from 'path'; -import type { DocumentMetadata } from '../interfaces'; -import type { StorageAdapterConfig, StoredData } from './adapter.interface'; -import { BaseStorageAdapter } from './base.adapter'; -import { ConfigurationError, StorageError } from '../errors'; - -/** - * Configuration for file storage adapter - */ -export interface FileStorageConfig extends StorageAdapterConfig { - /** - * Directory to store cache files - * @default './.cache/vectoriadb' - */ - cacheDir?: string; - - /** - * File name for the cache - * @default 'embeddings.json' - */ - fileName?: string; -} - -/** - * File-based storage adapter - * Stores embeddings in a JSON file with hash-based invalidation - * Perfect for local development to avoid recalculating embeddings - */ -export class FileStorageAdapter extends BaseStorageAdapter { - private fileConfig: Required>; - private filePath: string; - - constructor(config: FileStorageConfig = {}) { - super(config); - - // Sanitize namespace to prevent path traversal - const sanitizedNamespace = this.sanitizeNamespace(this.config.namespace); - - this.fileConfig = { - cacheDir: config.cacheDir ?? './.cache/vectoriadb', - fileName: config.fileName ?? 'embeddings.json', - }; - - this.filePath = path.join(this.fileConfig.cacheDir, sanitizedNamespace, this.fileConfig.fileName); - - // Verify the resolved path is still within cacheDir (path traversal protection) - this.validateFilePath(); - } - - /** - * Sanitize namespace to prevent path traversal attacks - * Removes dangerous characters and path traversal sequences - */ - private sanitizeNamespace(namespace: string): string { - return ( - namespace - // Remove path traversal sequences - .replace(/\.\./g, '') - // Replace path separators with hyphens - .replace(/[/\\]/g, '-') - // Remove leading dots and hyphens - .replace(/^[.-]+/, '') - // Remove trailing dots and hyphens - .replace(/[.-]+$/, '') - // Remove any remaining dangerous characters - .replace(/[^a-zA-Z0-9-_]/g, '') - // Limit length - .substring(0, 100) || 'default' - ); - } - - /** - * Validate that the file path doesn't escape the cache directory - */ - private validateFilePath(): void { - const resolvedPath = path.resolve(this.filePath); - const resolvedCacheDir = path.resolve(this.fileConfig.cacheDir); - - if (!resolvedPath.startsWith(resolvedCacheDir + path.sep) && resolvedPath !== resolvedCacheDir) { - throw new ConfigurationError( - `Invalid namespace: path traversal detected. ` + `Resolved path must be within cache directory.`, - ); - } - } - - override async initialize(): Promise { - // Ensure cache directory exists - const dir = path.dirname(this.filePath); - try { - await fs.mkdir(dir, { recursive: true }); - } catch (error) { - // With recursive:true, EEXIST shouldn't occur in modern Node.js - // Surface real errors like permission denials or disk full - throw new StorageError( - `Failed to create cache directory: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? 
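-        // (Illustrative: a namespace such as '../../etc' is reduced by
-        // sanitizeNamespace() above to 'etc', and validateFilePath() then
-        // re-checks that the resolved path stays inside cacheDir.)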
error : undefined, - ); - } - } - - override async load(): Promise | null> { - try { - const content = await fs.readFile(this.filePath, 'utf-8'); - return this.safeJsonParse>(content); - } catch { - // File doesn't exist or is invalid - return null; - } - } - - override async save(data: StoredData): Promise { - try { - const content = this.safeJsonStringify(data, true); - if (!content) { - throw new StorageError('Failed to serialize embeddings data'); - } - await fs.writeFile(this.filePath, content, 'utf-8'); - } catch (error) { - if (error instanceof StorageError) { - throw error; - } - throw new StorageError( - `Failed to save embeddings to file: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? error : undefined, - ); - } - } - - override async clear(): Promise { - try { - await fs.unlink(this.filePath); - } catch { - // File doesn't exist, ignore - } - } - - override async close(): Promise { - // No cleanup needed for file adapter - } -} diff --git a/libs/vectoriadb/src/storage/memory.adapter.ts b/libs/vectoriadb/src/storage/memory.adapter.ts deleted file mode 100644 index 33c58cb..0000000 --- a/libs/vectoriadb/src/storage/memory.adapter.ts +++ /dev/null @@ -1,40 +0,0 @@ -import type { DocumentMetadata } from '../interfaces'; -import type { StorageAdapterConfig, StoredData, StorageMetadata } from './adapter.interface'; -import { BaseStorageAdapter } from './base.adapter'; - -/** - * In-memory storage adapter (no persistence) - * This is the default adapter - data is lost on restart - */ -export class MemoryStorageAdapter extends BaseStorageAdapter { - private data: StoredData | null = null; - - constructor(config: StorageAdapterConfig = {}) { - super(config); - } - - override async initialize(): Promise { - // No initialization needed for memory adapter - } - - override async hasValidCache(_metadata: StorageMetadata): Promise { - // Memory adapter never has cached data on startup - return false; - } - - override async load(): Promise | null> { - return this.data; - } - - override async save(data: StoredData): Promise { - this.data = data; - } - - override async clear(): Promise { - this.data = null; - } - - override async close(): Promise { - this.data = null; - } -} diff --git a/libs/vectoriadb/src/storage/redis.adapter.ts b/libs/vectoriadb/src/storage/redis.adapter.ts deleted file mode 100644 index 23036d1..0000000 --- a/libs/vectoriadb/src/storage/redis.adapter.ts +++ /dev/null @@ -1,154 +0,0 @@ -import type { DocumentMetadata } from '../interfaces'; -import type { StorageAdapterConfig, StoredData } from './adapter.interface'; -import { BaseStorageAdapter } from './base.adapter'; -import { ConfigurationError, StorageError } from '../errors'; -import { SAFE_PATTERNS } from '../regex.utils'; - -/** - * Redis client interface (compatible with ioredis, redis, etc.) 
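- *
- * @example
- * ```ts
- * // A sketch only: an ioredis instance satisfies this shape directly
- * // (assumes the ioredis package, which is not bundled here).
- * import Redis from 'ioredis';
- * const adapter = new RedisStorageAdapter({ client: new Redis(), namespace: 'my-app' });
- * ```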
- */ -export interface RedisClient { - get(key: string): Promise; - set(key: string, value: string): Promise; - setex(key: string, seconds: number, value: string): Promise; - del(key: string): Promise; - ping(): Promise; - quit(): Promise; -} - -/** - * Configuration for Redis storage adapter - */ -export interface RedisStorageConfig extends StorageAdapterConfig { - /** - * Redis client instance - */ - client: RedisClient; - - /** - * TTL for cached data in seconds - * @default 86400 (24 hours) - */ - ttl?: number; - - /** - * Key prefix for Redis keys - * @default 'vectoriadb' - */ - keyPrefix?: string; -} - -/** - * Redis-based storage adapter - * Stores embeddings in Redis for distributed caching - * Perfect for multi-pod environments to share embeddings - */ -export class RedisStorageAdapter extends BaseStorageAdapter { - private redisConfig: Required>; - private redisKey: string; - - constructor(config: RedisStorageConfig) { - super(config); - - this.redisConfig = { - client: config.client, - ttl: config.ttl ?? 86400, // 24 hours default - keyPrefix: config.keyPrefix ?? 'vectoriadb', - }; - - // Sanitize namespace to prevent Redis command injection - const sanitizedNamespace = this.sanitizeNamespace(this.config.namespace); - const sanitizedKeyPrefix = this.sanitizeNamespace(this.redisConfig.keyPrefix); - - this.redisKey = `${sanitizedKeyPrefix}:${sanitizedNamespace}`; - } - - /** - * Sanitize namespace/key prefix to prevent Redis command injection - * Removes dangerous characters like newlines, carriage returns, and other control characters - * @private - */ - private sanitizeNamespace(namespace: string): string { - if (!namespace || typeof namespace !== 'string') { - throw new ConfigurationError('Namespace must be a non-empty string'); - } - - // Limit length FIRST to prevent ReDoS on uncontrolled input - const maxLength = 200; - const boundedNamespace = namespace.length > maxLength ? namespace.slice(0, maxLength) : namespace; - - // Remove newlines, carriage returns, and other control characters - // These could be used for command injection in Redis - // Uses pre-compiled safe patterns from regex.utils to prevent ReDoS - const sanitized = boundedNamespace - .replace(SAFE_PATTERNS.CONTROL_CHARS, '') // Remove control characters - .replace(SAFE_PATTERNS.REDIS_KEY_SAFE, '-') // Replace unsafe chars with dash - .replace(SAFE_PATTERNS.LEADING_DOTS_DASHES, '') // Remove leading dots and dashes - .replace(SAFE_PATTERNS.TRAILING_DOTS_DASHES, ''); // Remove trailing dots and dashes - - if (!sanitized) { - throw new ConfigurationError('Namespace becomes empty after sanitization'); - } - - return sanitized; - } - - override async initialize(): Promise { - // Test Redis connection - try { - await this.redisConfig.client.ping(); - } catch (error) { - throw new StorageError( - `Failed to connect to Redis: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? 
error : undefined, - ); - } - } - - override async load(): Promise | null> { - try { - const content = await this.redisConfig.client.get(this.redisKey); - if (!content) { - return null; - } - - return this.safeJsonParse>(content); - } catch { - // Redis error or invalid JSON - return null; - } - } - - override async save(data: StoredData): Promise { - try { - const content = this.safeJsonStringify(data); - if (!content) { - throw new StorageError('Failed to serialize embeddings data'); - } - - // Use SETEX to set with TTL - await this.redisConfig.client.setex(this.redisKey, this.redisConfig.ttl, content); - } catch (error) { - if (error instanceof StorageError) { - throw error; - } - throw new StorageError( - `Failed to save embeddings to Redis: ${error instanceof Error ? error.message : String(error)}`, - error instanceof Error ? error : undefined, - ); - } - } - - override async clear(): Promise { - try { - await this.redisConfig.client.del(this.redisKey); - } catch { - // Key doesn't exist, ignore - } - } - - override async close(): Promise { - // No-op: Users manage the Redis client lifecycle themselves - // The client is externally owned and may be shared across the application - } -} diff --git a/libs/vectoriadb/src/storage/serialization.utils.ts b/libs/vectoriadb/src/storage/serialization.utils.ts deleted file mode 100644 index 949c812..0000000 --- a/libs/vectoriadb/src/storage/serialization.utils.ts +++ /dev/null @@ -1,88 +0,0 @@ -import * as crypto from 'crypto'; -import type { DocumentEmbedding, DocumentMetadata } from '../interfaces'; -import type { SerializedEmbedding } from './adapter.interface'; - -/** - * Serialize a DocumentEmbedding to a SerializedEmbedding - * Converts Float32Array to regular array and Date to ISO string - */ -export function serializeEmbedding( - embedding: DocumentEmbedding, -): SerializedEmbedding { - return { - id: embedding.id, - vector: Array.from(embedding.vector), - metadata: embedding.metadata, - text: embedding.text, - createdAt: embedding.createdAt.toISOString(), - }; -} - -/** - * Deserialize a SerializedEmbedding to a DocumentEmbedding - * Sanitizes metadata to prevent prototype pollution - */ -export function deserializeEmbedding( - serialized: SerializedEmbedding, -): DocumentEmbedding { - // Sanitize metadata to prevent prototype pollution - const sanitizedMetadata = sanitizeObject(serialized.metadata) as T; - - return { - id: serialized.id, - vector: new Float32Array(serialized.vector), - metadata: sanitizedMetadata, - text: serialized.text, - createdAt: new Date(serialized.createdAt), - }; -} - -/** - * Create a hash from a string (simple implementation) - */ -export function hash(input: string): string { - return crypto.createHash('sha256').update(input, 'utf8').digest('hex').substring(0, 16); -} - -/** - * Create a hash from document IDs and texts - * Used to detect when tools/documents change - */ -export function createToolsHash(documents: Array<{ id: string; text: string }>): string { - const content = documents - .sort((a, b) => a.id.localeCompare(b.id)) - .map((d) => `${d.id}:${d.text}`) - .join('|'); - return hash(content); -} - -/** - * Sanitize an object to prevent prototype pollution - * Creates a clean object without dangerous properties - */ -export function sanitizeObject(obj: any): any { - if (obj === null || typeof obj !== 'object') { - return obj; - } - - // Handle arrays - if (Array.isArray(obj)) { - return obj.map((item) => sanitizeObject(item)); - } - - // Create clean object without prototype chain - const clean: any 
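-  // (Illustrative: JSON.parse('{"__proto__":{"x":1}}') produces an object whose
-  // own "__proto__" key is skipped by the loop below, so the sanitized copy is
-  // just {} and Object.prototype is never touched.)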
= {}; - - // Copy only safe properties - for (const key of Object.keys(obj)) { - // Block dangerous keys - if (key === '__proto__' || key === 'constructor' || key === 'prototype') { - continue; - } - - // Recursively sanitize nested objects - clean[key] = sanitizeObject(obj[key]); - } - - return clean; -} diff --git a/libs/vectoriadb/src/tfidf.embedding.service.ts b/libs/vectoriadb/src/tfidf.embedding.service.ts deleted file mode 100644 index 7a8c11a..0000000 --- a/libs/vectoriadb/src/tfidf.embedding.service.ts +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Lightweight TF-IDF based embedding service for semantic search - * - * This provides a simple, synchronous alternative to the ML-based EmbeddingService - * Ideal for use cases where: - * - You don't want to download ML models - * - You need synchronous operation - * - You have a small to medium corpus (< 10K documents) - * - You want zero external dependencies beyond Node.js - * - * Note: For production semantic search with larger corpora, use the ML-based - * EmbeddingService which provides better quality embeddings via transformers.js - */ -export class TFIDFEmbeddingService { - private vocabulary: Map = new Map(); - private idf: Map = new Map(); - private documentCount = 0; - - /** - * Tokenizes and normalizes text into terms - */ - tokenize(text: string): string[] { - return text - .toLowerCase() - .replace(/[^\w\s]/g, ' ') - .split(/\s+/) - .filter((term) => term.length > 1); // Filter out single characters - } - - /** - * Computes term frequency for a document - */ - private computeTermFrequency(terms: string[]): Map { - const tf = new Map(); - const totalTerms = terms.length; - - // Handle empty document edge case - if (totalTerms === 0) { - return tf; - } - - for (const term of terms) { - tf.set(term, (tf.get(term) || 0) + 1); - } - - // Normalize by document length - for (const [term, count] of tf.entries()) { - tf.set(term, count / totalTerms); - } - - return tf; - } - - /** - * Updates the IDF (Inverse Document Frequency) values - * This should be called whenever documents are added to the corpus - */ - updateIDF(documents: string[][]): void { - this.documentCount = documents.length; - const documentFrequency = new Map(); - - // Count how many documents contain each term - for (const terms of documents) { - const uniqueTerms = new Set(terms); - for (const term of uniqueTerms) { - documentFrequency.set(term, (documentFrequency.get(term) || 0) + 1); - } - } - - // Compute IDF: log(N / df) - for (const [term, df] of documentFrequency.entries()) { - this.idf.set(term, Math.log(this.documentCount / df)); - } - - // Build vocabulary - let index = 0; - for (const term of this.idf.keys()) { - if (!this.vocabulary.has(term)) { - this.vocabulary.set(term, index++); - } - } - } - - /** - * Generates a TF-IDF vector for a given text - * Returns a sparse vector representation as a Map - */ - embed(text: string): Map { - const terms = this.tokenize(text); - const tf = this.computeTermFrequency(terms); - const vector = new Map(); - - for (const [term, tfValue] of tf.entries()) { - const idfValue = this.idf.get(term) || 0; - if (idfValue > 0) { - vector.set(term, tfValue * idfValue); - } - } - - return vector; - } - - /** - * Converts a sparse vector to a dense Float32Array - * Uses the internal vocabulary for dimension mapping - * Missing terms are filled with zeros - */ - toDenseVector(sparseVector: Map): Float32Array { - const dimensions = this.vocabulary.size; - const dense = new Float32Array(dimensions); - - for (const [term, weight] of 
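-    // Worked example for the pipeline above (illustrative numbers): in a
-    // 4-document corpus where "auth" occurs in 1 document, idf = ln(4 / 1) ~ 1.386;
-    // in a 10-token document containing "auth" twice, tf = 2 / 10 = 0.2, so the
-    // weight stored in the sparse vector is 0.2 * 1.386 ~ 0.277.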
sparseVector.entries()) { - const index = this.vocabulary.get(term); - if (index !== undefined) { - dense[index] = weight; - } - } - - return dense; - } - - /** - * Computes cosine similarity between two sparse vectors - * More efficient than converting to dense vectors for TF-IDF - */ - cosineSimilarity(vector1: Map, vector2: Map): number { - let dotProduct = 0; - let magnitude1 = 0; - let magnitude2 = 0; - - // Compute dot product and magnitude of vector1 - for (const [term, value] of vector1.entries()) { - magnitude1 += value * value; - const value2 = vector2.get(term) || 0; - dotProduct += value * value2; - } - - // Compute magnitude of vector2 - for (const value of vector2.values()) { - magnitude2 += value * value; - } - - magnitude1 = Math.sqrt(magnitude1); - magnitude2 = Math.sqrt(magnitude2); - - if (magnitude1 === 0 || magnitude2 === 0) { - return 0; - } - - return dotProduct / (magnitude1 * magnitude2); - } - - /** - * Get the size of the vocabulary - */ - getVocabularySize(): number { - return this.vocabulary.size; - } - - /** - * Get the number of documents in the corpus - */ - getDocumentCount(): number { - return this.documentCount; - } - - /** - * Clear the IDF and vocabulary (useful for rebuilding the index) - */ - clear(): void { - this.vocabulary.clear(); - this.idf.clear(); - this.documentCount = 0; - } -} diff --git a/libs/vectoriadb/src/vectoria-tfidf.ts b/libs/vectoriadb/src/vectoria-tfidf.ts deleted file mode 100644 index 673b673..0000000 --- a/libs/vectoriadb/src/vectoria-tfidf.ts +++ /dev/null @@ -1,268 +0,0 @@ -import { TFIDFEmbeddingService } from './tfidf.embedding.service'; -import type { DocumentMetadata, SearchOptions, SearchResult } from './interfaces'; - -/** - * Document with TF-IDF sparse vector representation - */ -export interface TFIDFDocument { - /** - * Unique identifier for this document - */ - id: string; - - /** - * Sparse TF-IDF vector representation - */ - vector: Map; - - /** - * Associated metadata - */ - metadata: T; - - /** - * Original text used to generate the embedding - */ - text: string; - - /** - * Timestamp when this document was created - */ - createdAt: Date; -} - -/** - * Configuration for TF-IDF based VectoriaDB - */ -export interface TFIDFVectoriaConfig { - /** - * Default similarity threshold for search results - * @default 0.0 - */ - defaultSimilarityThreshold?: number; - - /** - * Maximum number of results to return by default - * @default 10 - */ - defaultTopK?: number; -} - -/** - * Lightweight TF-IDF based vector database - * - * A synchronous, zero-dependency alternative to the ML-based VectoriaDB - * Perfect for: - * - Small to medium corpora (< 10K documents) - * - Scenarios where ML model downloads are not acceptable - * - Use cases requiring synchronous operation - * - Keyword/term-based semantic search - * - * Limitations compared to ML-based VectoriaDB: - * - Less semantic understanding (synonyms, context) - * - Better for exact term matching - * - Requires reindexing when corpus changes - * - * @example - * ```ts - * const db = new TFIDFVectoria<{ appId: string }>(); - * - * // Add documents - * db.addDocument('tool1', 'User authentication tool', { appId: 'auth', id: 'tool1' }); - * db.addDocument('tool2', 'User profile retrieval', { appId: 'user', id: 'tool2' }); - * - * // Reindex after adding documents - * db.reindex(); - * - * // Search - * const results = db.search('authentication', { topK: 5 }); - * ``` - */ -export class TFIDFVectoria { - private documents: Map>; - private embeddingService: 
diff --git a/libs/vectoriadb/src/vectoria-tfidf.ts b/libs/vectoriadb/src/vectoria-tfidf.ts
deleted file mode 100644
index 673b673..0000000
--- a/libs/vectoriadb/src/vectoria-tfidf.ts
+++ /dev/null
@@ -1,268 +0,0 @@
-import { TFIDFEmbeddingService } from './tfidf.embedding.service';
-import type { DocumentMetadata, SearchOptions, SearchResult } from './interfaces';
-
-/**
- * Document with TF-IDF sparse vector representation
- */
-export interface TFIDFDocument<T extends DocumentMetadata = DocumentMetadata> {
-  /**
-   * Unique identifier for this document
-   */
-  id: string;
-
-  /**
-   * Sparse TF-IDF vector representation
-   */
-  vector: Map<string, number>;
-
-  /**
-   * Associated metadata
-   */
-  metadata: T;
-
-  /**
-   * Original text used to generate the embedding
-   */
-  text: string;
-
-  /**
-   * Timestamp when this document was created
-   */
-  createdAt: Date;
-}
-
-/**
- * Configuration for TF-IDF based VectoriaDB
- */
-export interface TFIDFVectoriaConfig {
-  /**
-   * Default similarity threshold for search results
-   * @default 0.0
-   */
-  defaultSimilarityThreshold?: number;
-
-  /**
-   * Maximum number of results to return by default
-   * @default 10
-   */
-  defaultTopK?: number;
-}
-
-/**
- * Lightweight TF-IDF based vector database
- *
- * A synchronous, zero-dependency alternative to the ML-based VectoriaDB.
- * Perfect for:
- * - Small to medium corpora (< 10K documents)
- * - Scenarios where ML model downloads are not acceptable
- * - Use cases requiring synchronous operation
- * - Keyword/term-based semantic search
- *
- * Limitations compared to the ML-based VectoriaDB:
- * - Less semantic understanding (synonyms, context)
- * - Relies on exact term overlap rather than semantic similarity
- * - Requires reindexing when the corpus changes
- *
- * @example
- * ```ts
- * const db = new TFIDFVectoria<{ appId: string }>();
- *
- * // Add documents
- * db.addDocument('tool1', 'User authentication tool', { appId: 'auth', id: 'tool1' });
- * db.addDocument('tool2', 'User profile retrieval', { appId: 'user', id: 'tool2' });
- *
- * // Reindex after adding documents
- * db.reindex();
- *
- * // Search
- * const results = db.search('authentication', { topK: 5 });
- * ```
- */
-export class TFIDFVectoria<T extends DocumentMetadata = DocumentMetadata> {
-  private documents: Map<string, TFIDFDocument<T>>;
-  private embeddingService: TFIDFEmbeddingService;
-  private config: Required<TFIDFVectoriaConfig>;
-  private needsReindex = false;
-
-  constructor(config: TFIDFVectoriaConfig = {}) {
-    this.documents = new Map();
-    this.embeddingService = new TFIDFEmbeddingService();
-
-    this.config = {
-      defaultSimilarityThreshold: config.defaultSimilarityThreshold ?? 0.0,
-      defaultTopK: config.defaultTopK ?? 10,
-    };
-  }
-
-  /**
-   * Add a document to the database.
-   * Note: You must call reindex() after adding documents for the IDF to be updated.
-   */
-  addDocument(id: string, text: string, metadata: T): void {
-    this.documents.set(id, {
-      id,
-      vector: new Map(), // Will be computed during reindex
-      metadata,
-      text,
-      createdAt: new Date(),
-    });
-
-    this.needsReindex = true;
-  }
-
-  /**
-   * Add multiple documents in batch.
-   * Note: You must call reindex() after adding documents for the IDF to be updated.
-   */
-  addDocuments(documents: Array<{ id: string; text: string; metadata: T }>): void {
-    for (const doc of documents) {
-      this.documents.set(doc.id, {
-        id: doc.id,
-        vector: new Map(),
-        metadata: doc.metadata,
-        text: doc.text,
-        createdAt: new Date(),
-      });
-    }
-
-    this.needsReindex = true;
-  }
-
-  /**
-   * Remove a document from the database.
-   * Note: You must call reindex() after removing documents for the IDF to be updated.
-   */
-  removeDocument(id: string): boolean {
-    const deleted = this.documents.delete(id);
-    if (deleted) {
-      this.needsReindex = true;
-    }
-    return deleted;
-  }
-
-  /**
-   * Get a document by ID
-   */
-  getDocument(id: string): TFIDFDocument<T> | undefined {
-    return this.documents.get(id);
-  }
-
-  /**
-   * Check if a document exists
-   */
-  hasDocument(id: string): boolean {
-    return this.documents.has(id);
-  }
-
-  /**
-   * Get all document IDs
-   */
-  getAllDocumentIds(): string[] {
-    return Array.from(this.documents.keys());
-  }
-
-  /**
-   * Get total number of documents
-   */
-  getDocumentCount(): number {
-    return this.documents.size;
-  }
-
-  /**
-   * Rebuild the IDF values and embeddings for all documents.
-   * Must be called after adding/removing documents.
-   */
-  reindex(): void {
-    if (!this.needsReindex) return;
-
-    const documentTexts: string[][] = [];
-    const entries = Array.from(this.documents.values());
-
-    // Tokenize all documents
-    for (const entry of entries) {
-      documentTexts.push(this.embeddingService.tokenize(entry.text));
-    }
-
-    // Update IDF values
-    this.embeddingService.updateIDF(documentTexts);
-
-    // Recompute vectors for all documents
-    for (const entry of entries) {
-      entry.vector = this.embeddingService.embed(entry.text);
-    }
-
-    this.needsReindex = false;
-  }
-
-  /**
-   * Check if reindexing is needed
-   */
-  needsReindexing(): boolean {
-    return this.needsReindex;
-  }
-
-  /**
-   * Search for documents matching the query
-   */
-  search(query: string, options: SearchOptions<T> = {}): SearchResult<T>[] {
-    const { topK = this.config.defaultTopK, threshold = this.config.defaultSimilarityThreshold, filter } = options;
-
-    // Reindex if needed
-    if (this.needsReindex) {
-      this.reindex();
-    }
-
-    // Generate query vector
-    const queryVector = this.embeddingService.embed(query);
-
-    const results: SearchResult<T>[] = [];
-
-    // Compute similarity for each document
-    for (const doc of this.documents.values()) {
-      // Apply metadata filter if provided
-      if (filter && !filter(doc.metadata)) {
-        continue;
-      }
-
-      // Compute similarity
-      const score = this.embeddingService.cosineSimilarity(queryVector, doc.vector);
-
-      if (score >= threshold) {
-        results.push({
-          id: doc.id,
-          metadata: doc.metadata,
-          score,
-          text: doc.text,
-        });
-      }
-    }
-
-    // Sort by score (descending) and return top K
-    results.sort((a, b) => b.score - a.score);
-    return results.slice(0, topK);
-  }
-
-  /**
-   * Clear all documents and reset the index
-   */
-  clear(): void {
-    this.documents.clear();
-    this.embeddingService.clear();
-    this.needsReindex = false;
-  }
-
-  /**
-   * Get statistics about the database
-   */
-  getStats(): {
-    documentCount: number;
-    vocabularySize: number;
-    needsReindex: boolean;
-  } {
-    return {
-      documentCount: this.documents.size,
-      vocabularySize: this.embeddingService.getVocabularySize(),
-      needsReindex: this.needsReindex,
-    };
-  }
-}
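The lifecycle of the deleted class was add → reindex → search; vectors stay empty until `reindex()` runs, although `search()` also triggers it lazily. A sketch, assuming a metadata shape of `{ id, appId }` (the document texts are invented):

```ts
const db = new TFIDFVectoria<{ id: string; appId: string }>();

db.addDocuments([
  { id: 'tool1', text: 'User authentication tool', metadata: { id: 'tool1', appId: 'auth' } },
  { id: 'tool2', text: 'User profile retrieval', metadata: { id: 'tool2', appId: 'user' } },
]);

// search() would reindex lazily; doing it eagerly keeps query latency flat.
db.reindex();

// The metadata filter runs before scoring, so off-app documents are skipped.
const [best] = db.search('authentication', { topK: 5, filter: (m) => m.appId === 'auth' });
console.log(best?.id); // 'tool1'
```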
diff --git a/libs/vectoriadb/src/vectoria.ts b/libs/vectoriadb/src/vectoria.ts
deleted file mode 100644
index 7e658e0..0000000
--- a/libs/vectoriadb/src/vectoria.ts
+++ /dev/null
@@ -1,771 +0,0 @@
-import { EmbeddingService } from './embedding.service';
-import { cosineSimilarity } from './similarity.utils';
-import { HNSWIndex } from './hnsw.index';
-import type {
-  VectoriaConfig,
-  DocumentEmbedding,
-  DocumentMetadata,
-  SearchOptions,
-  SearchResult,
-  VectoriaStats,
-} from './interfaces';
-import { BaseStorageAdapter, StorageMetadata, StoredData } from './storage/adapter.interface';
-import * as SerializationUtils from './storage/serialization.utils';
-import { MemoryStorageAdapter } from './storage/memory.adapter';
-import {
-  VectoriaNotInitializedError,
-  DocumentValidationError,
-  DocumentNotFoundError,
-  DocumentExistsError,
-  DuplicateDocumentError,
-  QueryValidationError,
-  EmbeddingError,
-} from './errors';
-
-/**
- * VectoriaDB - A lightweight, production-ready in-memory vector database
- *
- * Features:
- * - Semantic search using cosine similarity
- * - Flexible metadata filtering
- * - Batch operations for efficiency
- * - TypeScript generic support for type-safe metadata
- */
-export class VectoriaDB<T extends DocumentMetadata = DocumentMetadata> {
-  private embeddings: Map<string, DocumentEmbedding<T>>;
-  private embeddingService: EmbeddingService;
-  private config: Required<VectoriaConfig>;
-  private hnswIndex: HNSWIndex | null;
-  private storageAdapter: BaseStorageAdapter;
-
-  constructor(config: VectoriaConfig = {}) {
-    this.embeddings = new Map();
-    this.embeddingService = new EmbeddingService(config.modelName, config.cacheDir);
-
-    this.config = {
-      modelName: config.modelName ?? 'Xenova/all-MiniLM-L6-v2',
-      cacheDir: config.cacheDir ?? './.cache/transformers',
-      dimensions: config.dimensions ?? 384,
-      defaultSimilarityThreshold: config.defaultSimilarityThreshold ?? 0.3,
-      defaultTopK: config.defaultTopK ?? 10,
-      useHNSW: config.useHNSW ?? false,
-      hnsw: config.hnsw ?? {},
-      storageAdapter: config.storageAdapter,
-      toolsHash: config.toolsHash ?? '',
-      version: config.version ?? '1.0.0',
-      maxDocuments: config.maxDocuments ?? 100000,
-      maxDocumentSize: config.maxDocumentSize ?? 1000000,
-      maxBatchSize: config.maxBatchSize ?? 1000,
-      verboseErrors: config.verboseErrors ?? true,
-    };
-
-    // Initialize HNSW index if enabled
-    if (this.config.useHNSW) {
-      this.hnswIndex = new HNSWIndex(this.config.hnsw);
-    } else {
-      this.hnswIndex = null;
-    }
-
-    // Initialize storage adapter (default to in-memory)
-    this.storageAdapter = config.storageAdapter ?? new MemoryStorageAdapter();
-  }
-
-  /**
-   * Initialize the vector database.
-   * Must be called before using the database.
-   * Automatically loads from cache if available and valid.
-   */
-  async initialize(): Promise<void> {
-    // Initialize embedding service
-    await this.embeddingService.initialize();
-    this.config.dimensions = this.embeddingService.getDimensions();
-
-    // Initialize storage adapter
-    await this.storageAdapter.initialize();
-
-    // Try to load from cache
-    const loaded = await this.loadFromStorage();
-    if (loaded) {
-      // Successfully loaded from cache
-      return;
-    }
-
-    // No valid cache, continue with empty database
-  }
-
-  /**
-   * Check if the database is initialized
-   */
-  isInitialized(): boolean {
-    return this.embeddingService.isReady();
-  }
-
-  /**
-   * Add a document to the vector database
-   * @throws Error if database is not initialized, document ID already exists, or text is empty
-   */
-  async add(id: string, text: string, metadata: T): Promise<void> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('adding documents');
-    }
-
-    // Check document count limit (DoS protection)
-    if (this.embeddings.size >= this.config.maxDocuments) {
-      throw new DocumentValidationError(
-        `Document limit exceeded. Maximum allowed documents: ${this.config.maxDocuments}`,
-        id,
-      );
-    }
-
-    if (!text || !text.trim()) {
-      throw new DocumentValidationError('Document text cannot be empty or whitespace-only', id);
-    }
-
-    // Check document size limit (DoS protection)
-    if (text.length > this.config.maxDocumentSize) {
-      throw new DocumentValidationError(
-        `Document text exceeds maximum size. Maximum allowed: ${this.config.maxDocumentSize} characters`,
-        id,
-      );
-    }
-
-    if (this.embeddings.has(id)) {
-      throw new DocumentExistsError(id);
-    }
-
-    if (metadata.id !== id) {
-      throw new DocumentValidationError(`Metadata id "${metadata.id}" does not match document id "${id}"`, id);
-    }
-
-    // Generate embedding
-    const vector = await this.embeddingService.generateEmbedding(text);
-
-    // Create embedding object
-    const embedding: DocumentEmbedding<T> = {
-      id,
-      vector,
-      metadata,
-      text,
-      createdAt: new Date(),
-    };
-
-    // Store embedding
-    this.embeddings.set(id, embedding);
-
-    // Add to HNSW index if enabled
-    if (this.hnswIndex) {
-      this.hnswIndex.insert(id, vector);
-    }
-  }
-
-  /**
-   * Add multiple documents in batch
-   * @throws Error if database is not initialized, any document ID already exists, or any text is empty
-   */
-  async addMany(documents: Array<{ id: string; text: string; metadata: T }>): Promise<void> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('adding documents');
-    }
-
-    // Check batch size limit (DoS protection)
-    if (documents.length > this.config.maxBatchSize) {
-      throw new DocumentValidationError(
-        `Batch size exceeds maximum allowed. Maximum: ${this.config.maxBatchSize}, provided: ${documents.length}`,
-      );
-    }
-
-    // Check if adding these documents would exceed the total document limit (DoS protection)
-    const newTotal = this.embeddings.size + documents.length;
-    if (newTotal > this.config.maxDocuments) {
-      throw new DocumentValidationError(
-        `Adding ${documents.length} documents would exceed maximum document limit. Current: ${this.embeddings.size}, Maximum: ${this.config.maxDocuments}`,
-      );
-    }
-
-    // Check for duplicate IDs within the batch and validate text
-    const ids = new Set<string>();
-    for (const doc of documents) {
-      if (!doc.text || !doc.text.trim()) {
-        throw new DocumentValidationError(`Document with id "${doc.id}" has empty or whitespace-only text`, doc.id);
-      }
-      // Check document size limit (DoS protection)
-      if (doc.text.length > this.config.maxDocumentSize) {
-        throw new DocumentValidationError(
-          `Document with id "${doc.id}" exceeds maximum size. Maximum allowed: ${this.config.maxDocumentSize} characters`,
-          doc.id,
-        );
-      }
-      if (doc.metadata.id !== doc.id) {
-        throw new DocumentValidationError(
-          `Document with id "${doc.id}": metadata.id "${doc.metadata.id}" does not match document id`,
-          doc.id,
-        );
-      }
-      if (ids.has(doc.id)) {
-        throw new DuplicateDocumentError(doc.id, 'batch');
-      }
-      if (this.embeddings.has(doc.id)) {
-        throw new DuplicateDocumentError(doc.id, 'existing');
-      }
-      ids.add(doc.id);
-    }
-
-    // Extract texts
-    const texts = documents.map((d) => d.text);
-
-    // Generate embeddings in batch
-    const vectors = await this.embeddingService.generateEmbeddings(texts);
-
-    // Defensive check: ensure vectors match documents
-    if (vectors.length !== documents.length) {
-      throw new EmbeddingError(
-        `Embedding generation mismatch: expected ${documents.length} vectors, got ${vectors.length}`,
-      );
-    }
-
-    // Store embeddings
-    for (let i = 0; i < documents.length; i++) {
-      const { id, text, metadata } = documents[i];
-      const vector = vectors[i];
-
-      const embedding: DocumentEmbedding<T> = {
-        id,
-        vector,
-        metadata,
-        text,
-        createdAt: new Date(),
-      };
-
-      this.embeddings.set(id, embedding);
-
-      // Add to HNSW index if enabled
-      if (this.hnswIndex) {
-        this.hnswIndex.insert(id, vector);
-      }
-    }
-  }
-
-  /**
-   * Search for documents using semantic similarity
-   * @throws Error if database is not initialized, query is empty, or search parameters are invalid
-   */
-  async search(query: string, options: SearchOptions<T> = {}): Promise<SearchResult<T>[]> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('searching');
-    }
-
-    if (!query || !query.trim()) {
-      throw new QueryValidationError('Search query cannot be empty or whitespace-only');
-    }
-
-    // Get threshold and topK
-    const threshold = options.threshold ?? this.config.defaultSimilarityThreshold;
-    const topK = options.topK ?? this.config.defaultTopK;
-
-    if (topK <= 0) {
-      throw new QueryValidationError('topK must be a positive number');
-    }
-
-    if (threshold < 0 || threshold > 1) {
-      throw new QueryValidationError('threshold must be between 0 and 1');
-    }
-
-    // Generate query embedding
-    const queryVector = await this.embeddingService.generateEmbedding(query);
-
-    // Use HNSW index if enabled
-    if (this.hnswIndex) {
-      return this.searchWithHNSW(queryVector, topK, threshold, options);
-    }
-
-    // Fall back to brute-force search
-    return this.searchBruteForce(queryVector, topK, threshold, options);
-  }
-
-  /**
-   * Search using HNSW index (approximate nearest neighbor)
-   */
-  private searchWithHNSW(
-    queryVector: Float32Array,
-    topK: number,
-    threshold: number,
-    options: SearchOptions<T>,
-  ): SearchResult<T>[] {
-    // Get candidates from HNSW (more than topK to account for filtering)
-    const searchK = options.filter ? Math.min(topK * 3, this.embeddings.size) : topK;
-    const candidates = this.hnswIndex!.search(queryVector, searchK, this.config.hnsw?.efSearch);
-
-    const results: SearchResult<T>[] = [];
-
-    for (const candidate of candidates) {
-      const embedding = this.embeddings.get(candidate.id);
-      if (!embedding) {
-        continue;
-      }
-
-      // Apply filter if provided
-      if (options.filter && !options.filter(embedding.metadata)) {
-        continue;
-      }
-
-      // Convert distance to similarity (HNSW uses distance = 1 - similarity)
-      const score = 1 - candidate.distance;
-
-      if (score >= threshold) {
-        const result: SearchResult<T> = {
-          id: embedding.id,
-          metadata: embedding.metadata,
-          score,
-          text: embedding.text,
-        };
-
-        if (options.includeVector) {
-          result.vector = embedding.vector;
-        }
-
-        results.push(result);
-
-        // Stop early if we have enough results
-        if (results.length >= topK) {
-          break;
-        }
-      }
-    }
-
-    return results;
-  }
-
-  /**
-   * Search using brute-force (exact nearest neighbor)
-   */
-  private searchBruteForce(
-    queryVector: Float32Array,
-    topK: number,
-    threshold: number,
-    options: SearchOptions<T>,
-  ): SearchResult<T>[] {
-    const results: SearchResult<T>[] = [];
-
-    for (const embedding of this.embeddings.values()) {
-      // Apply filter if provided
-      if (options.filter && !options.filter(embedding.metadata)) {
-        continue;
-      }
-
-      const score = cosineSimilarity(queryVector, embedding.vector);
-
-      if (score >= threshold) {
-        const result: SearchResult<T> = {
-          id: embedding.id,
-          metadata: embedding.metadata,
-          score,
-          text: embedding.text,
-        };
-
-        if (options.includeVector) {
-          result.vector = embedding.vector;
-        }
-
-        results.push(result);
-      }
-    }
-
-    // Sort by score (descending)
-    results.sort((a, b) => b.score - a.score);
-
-    // Return top K
-    return results.slice(0, topK);
-  }
-
-  /**
-   * Get a document by ID
-   */
-  get(id: string): DocumentEmbedding<T> | undefined {
-    return this.embeddings.get(id);
-  }
-
-  /**
-   * Check if a document exists
-   */
-  has(id: string): boolean {
-    return this.embeddings.has(id);
-  }
-
-  /**
-   * Remove a document from the database
-   */
-  remove(id: string): boolean {
-    const deleted = this.embeddings.delete(id);
-
-    // Remove from HNSW index if enabled
-    if (deleted && this.hnswIndex) {
-      this.hnswIndex.remove(id);
-    }
-
-    return deleted;
-  }
-
-  /**
-   * Remove multiple documents
-   */
-  removeMany(ids: string[]): number {
-    let removed = 0;
-    for (const id of ids) {
-      if (this.remove(id)) {
-        removed++;
-      }
-    }
-    return removed;
-  }
-
-  /**
-   * Update document metadata without re-embedding.
-   * Fast operation - only updates metadata, keeps existing embedding.
-   * @throws Error if database is not initialized or document doesn't exist
-   */
-  updateMetadata(id: string, metadata: T): void {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('updating');
-    }
-
-    const existing = this.embeddings.get(id);
-    if (!existing) {
-      throw new DocumentNotFoundError(id);
-    }
-
-    // Update metadata only, keep everything else the same
-    existing.metadata = metadata;
-  }
-
-  /**
-   * Update document with smart re-embedding.
-   * Only re-embeds if text changes (unless forceReembed is true).
-   * @throws Error if database is not initialized, document doesn't exist, or text is empty
-   */
-  async update(
-    id: string,
-    updates: { text?: string; metadata?: T },
-    options: { forceReembed?: boolean } = {},
-  ): Promise<boolean> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('updating');
-    }
-
-    const existing = this.embeddings.get(id);
-    if (!existing) {
-      throw new DocumentNotFoundError(id);
-    }
-
-    // Check if text is being updated
-    const textChanged = updates.text !== undefined && updates.text !== existing.text;
-    const needsReembed = textChanged || options.forceReembed;
-
-    // Validate new text if provided
-    if (updates.text !== undefined && (!updates.text || !updates.text.trim())) {
-      throw new DocumentValidationError('Document text cannot be empty or whitespace-only', id);
-    }
-
-    // Check document size limit (DoS protection)
-    if (updates.text !== undefined && updates.text.length > this.config.maxDocumentSize) {
-      throw new DocumentValidationError(
-        `Document text exceeds maximum size. Maximum allowed: ${this.config.maxDocumentSize} characters`,
-        id,
-      );
-    }
-
-    // Update metadata if provided
-    if (updates.metadata !== undefined) {
-      existing.metadata = updates.metadata;
-    }
-
-    // Update text and re-embed if needed
-    if (needsReembed && updates.text !== undefined) {
-      const newText = updates.text;
-
-      // Remove from HNSW index (will re-add with new embedding)
-      if (this.hnswIndex) {
-        this.hnswIndex.remove(id);
-      }
-
-      // Generate new embedding
-      const vector = await this.embeddingService.generateEmbedding(newText);
-
-      // Update the embedding
-      existing.vector = vector;
-      existing.text = newText;
-      existing.createdAt = new Date();
-
-      // Re-add to HNSW index
-      if (this.hnswIndex) {
-        this.hnswIndex.insert(id, vector);
-      }
-
-      return true; // Re-embedded
-    }
-
-    return false; // No re-embedding needed
-  }
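The point of the `update()` flow above: metadata-only edits are free, while text edits pay for exactly one new embedding plus an HNSW evict-and-reinsert. A sketch, assuming an already-initialized `VectoriaDB` instance named `db` inside an async context (ids and texts invented):

```ts
// Metadata-only change: no model call, existing vector kept → resolves false.
await db.update('tool1', { metadata: { id: 'tool1', appId: 'auth-v2' } });

// Text change: vector removed from HNSW, re-embedded, re-inserted → resolves true.
await db.update('tool1', { text: 'User login and session management tool' });
```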
-
-  /**
-   * Update multiple documents with smart re-embedding.
-   * Only re-embeds documents where the text changed.
-   * @throws Error if database is not initialized, any document doesn't exist, or any text is empty
-   */
-  async updateMany(
-    updates: Array<{ id: string; text?: string; metadata?: T }>,
-    options: { forceReembed?: boolean } = {},
-  ): Promise<{ updated: number; reembedded: number }> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('updating');
-    }
-
-    // Check batch size limit (DoS protection)
-    if (updates.length > this.config.maxBatchSize) {
-      throw new DocumentValidationError(
-        `Batch size exceeds maximum allowed. Maximum: ${this.config.maxBatchSize}, provided: ${updates.length}`,
-      );
-    }
-
-    // Validate that all documents exist and that new texts are valid
-    for (const update of updates) {
-      if (!this.embeddings.has(update.id)) {
-        throw new DocumentNotFoundError(update.id);
-      }
-      if (update.text !== undefined && (!update.text || !update.text.trim())) {
-        throw new DocumentValidationError(
-          `Document with id "${update.id}" has empty or whitespace-only text`,
-          update.id,
-        );
-      }
-      // Check document size limit (DoS protection)
-      if (update.text !== undefined && update.text.length > this.config.maxDocumentSize) {
-        throw new DocumentValidationError(
-          `Document with id "${update.id}" exceeds maximum size. Maximum allowed: ${this.config.maxDocumentSize} characters`,
-          update.id,
-        );
-      }
-    }
-
-    // Separate updates into metadata-only and re-embedding required
-    const metadataOnlyUpdates: typeof updates = [];
-    const reembedUpdates: typeof updates = [];
-
-    for (const update of updates) {
-      const existing = this.embeddings.get(update.id)!;
-      const textChanged = update.text !== undefined && update.text !== existing.text;
-      const needsReembed = textChanged || options.forceReembed;
-
-      if (needsReembed && update.text !== undefined) {
-        reembedUpdates.push(update);
-      } else {
-        metadataOnlyUpdates.push(update);
-      }
-    }
-
-    // Apply metadata-only updates (fast)
-    for (const update of metadataOnlyUpdates) {
-      const existing = this.embeddings.get(update.id)!;
-      if (update.metadata !== undefined) {
-        existing.metadata = update.metadata;
-      }
-    }
-
-    // Batch re-embed the updates that need it
-    if (reembedUpdates.length > 0) {
-      const texts = reembedUpdates.map((u) => u.text!);
-      const vectors = await this.embeddingService.generateEmbeddings(texts);
-
-      for (let i = 0; i < reembedUpdates.length; i++) {
-        const update = reembedUpdates[i];
-        const existing = this.embeddings.get(update.id)!;
-        const vector = vectors[i];
-
-        // Remove from HNSW if needed
-        if (this.hnswIndex) {
-          this.hnswIndex.remove(update.id);
-        }
-
-        // Update embedding
-        existing.vector = vector;
-        existing.text = update.text!;
-        existing.createdAt = new Date();
-
-        // Update metadata if provided
-        if (update.metadata !== undefined) {
-          existing.metadata = update.metadata;
-        }
-
-        // Re-add to HNSW
-        if (this.hnswIndex) {
-          this.hnswIndex.insert(update.id, vector);
-        }
-      }
-    }
-
-    return {
-      updated: updates.length,
-      reembedded: reembedUpdates.length,
-    };
-  }
-
-  /**
-   * Clear all embeddings
-   */
-  clear(): void {
-    this.embeddings.clear();
-
-    // Clear HNSW index if enabled
-    if (this.hnswIndex) {
-      this.hnswIndex.clear();
-    }
-  }
-
-  /**
-   * Get the number of embeddings
-   */
-  size(): number {
-    return this.embeddings.size;
-  }
-
-  /**
-   * Get all embedding IDs
-   */
-  keys(): string[] {
-    return Array.from(this.embeddings.keys());
-  }
-
-  /**
-   * Get all embeddings
-   */
-  values(): DocumentEmbedding<T>[] {
-    return Array.from(this.embeddings.values());
-  }
-
-  /**
-   * Get database statistics
-   * @throws Error if database is not initialized
-   */
-  getStats(): VectoriaStats {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('getting stats');
-    }
-
-    // Estimate memory usage
-    const vectorBytes = this.embeddings.size * this.config.dimensions * 4; // Float32
-    const metadataBytes = this.embeddings.size * 1024; // ~1KB per metadata entry (rough estimate)
-
-    return {
-      totalEmbeddings: this.embeddings.size,
-      dimensions: this.config.dimensions,
-      estimatedMemoryBytes: vectorBytes + metadataBytes,
-      modelName: this.config.modelName,
-    };
-  }
-
-  /**
-   * Get documents by filter (without semantic search)
-   */
-  filter(filterFn: (metadata: T) => boolean): DocumentEmbedding<T>[] {
-    return Array.from(this.embeddings.values()).filter((embedding) => filterFn(embedding.metadata));
-  }
-
-  /**
-   * Get all documents
-   */
-  getAll(): DocumentEmbedding<T>[] {
-    return Array.from(this.embeddings.values());
-  }
-
-  /**
-   * Save embeddings to storage.
-   * Call this method to persist embeddings manually.
-   */
-  async saveToStorage(): Promise<void> {
-    if (!this.isInitialized()) {
-      throw new VectoriaNotInitializedError('saving');
-    }
-
-    const metadata = this.getStorageMetadata();
-    const embeddings = Array.from(this.embeddings.values()).map((emb) => SerializationUtils.serializeEmbedding(emb));
-
-    const data: StoredData = {
-      metadata,
-      embeddings,
-    };
-
-    await this.storageAdapter.save(data);
-  }
-
-  /**
-   * Load embeddings from storage.
-   * Returns true if successfully loaded from cache.
-   * @private
-   */
-  private async loadFromStorage(): Promise<boolean> {
-    try {
-      const metadata = this.getStorageMetadata();
-
-      // Check if a valid cache exists
-      const hasValid = await this.storageAdapter.hasValidCache(metadata);
-      if (!hasValid) {
-        return false;
-      }
-
-      // Load from storage
-      const data = await this.storageAdapter.load();
-      if (!data || !data.embeddings || data.embeddings.length === 0) {
-        return false;
-      }
-
-      // Clear existing data
-      this.clear();
-
-      // Restore embeddings
-      for (const serialized of data.embeddings) {
-        const embedding = SerializationUtils.deserializeEmbedding(serialized);
-        this.embeddings.set(embedding.id, embedding);
-
-        // Add to HNSW index if enabled
-        if (this.hnswIndex) {
-          this.hnswIndex.insert(embedding.id, embedding.vector);
-        }
-      }
-
-      return true;
-    } catch {
-      // Failed to load from cache; return false to continue with an empty database
-      return false;
-    }
-  }
-
-  /**
-   * Get storage metadata for the current state
-   * @private
-   */
-  private getStorageMetadata(): StorageMetadata {
-    return {
-      version: this.config.version,
-      toolsHash: this.config.toolsHash,
-      timestamp: Date.now(),
-      modelName: this.config.modelName,
-      dimensions: this.config.dimensions,
-      documentCount: this.embeddings.size,
-    };
-  }
-
-  /**
-   * Clear the storage cache.
-   * This will delete all persisted embeddings.
-   */
-  async clearStorage(): Promise<void> {
-    await this.storageAdapter.clear();
-  }
-
-  /**
-   * Close the database and storage adapter.
-   * Call this when shutting down to clean up resources.
-   */
-  async close(): Promise<void> {
-    await this.storageAdapter.close();
-  }
-}
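Putting the deleted API together, the intended call sequence was initialize → add/addMany → search → saveToStorage → close. A minimal sketch (document texts and metadata invented; with `useHNSW` enabled, the filter is applied to the oversampled HNSW candidates as shown in `searchWithHNSW` above):

```ts
const db = new VectoriaDB<{ id: string; appId: string }>({ useHNSW: true });

// Loads the embedding model and restores a valid cache, if one exists.
await db.initialize();

await db.addMany([
  { id: 'tool1', text: 'User authentication tool', metadata: { id: 'tool1', appId: 'auth' } },
  { id: 'tool2', text: 'User profile retrieval', metadata: { id: 'tool2', appId: 'user' } },
]);

const hits = await db.search('how do users log in?', {
  topK: 3,
  threshold: 0.3,
  filter: (m) => m.appId === 'auth',
});
console.log(hits[0]?.id); // 'tool1', assuming the corpus above

await db.saveToStorage(); // persist through the configured storage adapter
await db.close();
```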
"moduleResolution": "node10", - "types": ["jest", "node"] - }, - "include": ["jest.config.ts", "src/**/*.test.ts", "src/**/*.spec.ts", "src/**/*.d.ts"] -} diff --git a/tsconfig.base.json b/tsconfig.base.json index ae92a6d..822dbd1 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -23,7 +23,6 @@ "paths": { "enclave-vm": ["libs/enclave-vm/src/index.ts"], "ast-guard": ["libs/ast-guard/src/index.ts"], - "vectoriadb": ["libs/vectoriadb/src/index.ts"], "@enclavejs/types": ["libs/enclavejs-types/src/index.ts"], "@enclavejs/stream": ["libs/enclavejs-stream/src/index.ts"], "@enclavejs/broker": ["libs/enclavejs-broker/src/index.ts"],