Skip to content

Latest commit

 

History

History
679 lines (559 loc) · 17.4 KB

File metadata and controls

679 lines (559 loc) · 17.4 KB

Database Performance Monitoring Setup

Target: Real-time visibility into Protocol Guide database performance


Overview

This guide sets up comprehensive database monitoring for:

  • Vector search performance (HNSW index usage)
  • Connection pool health
  • Query plan analysis
  • Index bloat detection
  • Slow query logging

Estimated Setup Time: 4-8 hours


1. Slow Query Logging

Setup Database Table

File: supabase/migrations/YYYYMMDD_add_slow_query_logging.sql

-- Slow query log table
-- One row per query that exceeded the logging threshold, with optional
-- plan/params context for later analysis.
CREATE TABLE IF NOT EXISTS slow_query_log (
  id BIGSERIAL PRIMARY KEY,
  query_text TEXT NOT NULL,
  execution_time_ms INTEGER NOT NULL,
  plan JSONB,
  query_params JSONB,
  user_id INTEGER,
  agency_id INTEGER,
  state_code CHAR(2),
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- IF NOT EXISTS keeps the index creation idempotent, matching the
-- CREATE TABLE IF NOT EXISTS above (re-running the migration previously
-- failed on duplicate index names).

-- Index for time-based queries
CREATE INDEX IF NOT EXISTS idx_slow_query_log_created ON slow_query_log(created_at DESC);

-- Index for performance analysis
CREATE INDEX IF NOT EXISTS idx_slow_query_log_execution_time ON slow_query_log(execution_time_ms DESC);

-- Partial index for user analysis (skips the common NULL case)
CREATE INDEX IF NOT EXISTS idx_slow_query_log_user ON slow_query_log(user_id) WHERE user_id IS NOT NULL;

COMMENT ON TABLE slow_query_log IS
  'Logs queries exceeding performance thresholds for analysis and optimization';

-- Function to log slow queries.
-- p_threshold_ms generalizes the previously hard-coded 200ms cutoff; the
-- default preserves existing callers' behavior.
CREATE OR REPLACE FUNCTION log_slow_query(
  p_query_text TEXT,
  p_execution_time_ms INTEGER,
  p_plan JSONB DEFAULT NULL,
  p_query_params JSONB DEFAULT NULL,
  p_user_id INTEGER DEFAULT NULL,
  p_agency_id INTEGER DEFAULT NULL,
  p_state_code CHAR(2) DEFAULT NULL,
  p_threshold_ms INTEGER DEFAULT 200
)
RETURNS void AS $$
BEGIN
  -- Only log queries meeting or exceeding the threshold
  IF p_execution_time_ms >= p_threshold_ms THEN
    INSERT INTO slow_query_log (
      query_text,
      execution_time_ms,
      plan,
      query_params,
      user_id,
      agency_id,
      state_code
    ) VALUES (
      p_query_text,
      p_execution_time_ms,
      p_plan,
      p_query_params,
      p_user_id,
      p_agency_id,
      p_state_code
    );
  END IF;
END;
$$ LANGUAGE plpgsql;

-- Grant access
-- NOTE(review): SELECT for 'authenticated' exposes every user's slow-query
-- text/params to all signed-in users — confirm this should not be
-- restricted to service_role or protected with RLS.
GRANT SELECT ON slow_query_log TO authenticated, service_role;
GRANT EXECUTE ON FUNCTION log_slow_query TO authenticated, service_role;

-- Retention policy: Auto-delete logs older than 30 days
CREATE OR REPLACE FUNCTION cleanup_slow_query_log()
RETURNS void AS $$
BEGIN
  DELETE FROM slow_query_log
  WHERE created_at < NOW() - INTERVAL '30 days';
END;
$$ LANGUAGE plpgsql;

-- Schedule cleanup (run daily via pg_cron or external scheduler)
-- EXAMPLE: SELECT cron.schedule('cleanup-slow-queries', '0 2 * * *', 'SELECT cleanup_slow_query_log()');

Instrument Application Code

File: server/_core/rag/latency.ts

import { createClient } from '@supabase/supabase-js';

const supabase = createClient(
  process.env.SUPABASE_URL!,
  process.env.SUPABASE_SERVICE_ROLE_KEY!
);

/**
 * Log a slow query to the slow_query_log table via the log_slow_query RPC.
 *
 * Fire-safe: any failure is caught and logged so monitoring can never
 * break the request path that triggered it.
 *
 * @param params.queryText       Identifier for the query (e.g. 'vector_search').
 * @param params.executionTimeMs Measured wall-clock duration in milliseconds.
 * @param params.plan            Optional EXPLAIN plan (JSON-serializable).
 * @param params.queryParams     Optional query parameters for later analysis.
 * @param params.userId          Optional user id associated with the query.
 * @param params.agencyId        Optional agency id associated with the query.
 * @param params.stateCode       Optional two-letter state code.
 */
export async function logSlowQuery(params: {
  queryText: string;
  executionTimeMs: number;
  plan?: object;
  queryParams?: object;
  userId?: number;
  agencyId?: number;
  stateCode?: string;
}) {
  try {
    // Use ?? rather than || so legitimate falsy values (e.g. an id of 0)
    // are not silently converted to NULL.
    await supabase.rpc('log_slow_query', {
      p_query_text: params.queryText,
      p_execution_time_ms: params.executionTimeMs,
      p_plan: params.plan ?? null,
      p_query_params: params.queryParams ?? null,
      p_user_id: params.userId ?? null,
      p_agency_id: params.agencyId ?? null,
      p_state_code: params.stateCode ?? null,
    });
  } catch (error) {
    // Don't fail the request if logging fails
    console.error('[Monitoring] Failed to log slow query:', error);
  }
}

/**
 * Enhanced latency monitor with slow query logging
 */
export class LatencyMonitor {
  // ... existing implementation

  /**
   * Record a latency sample and, for vector searches at or above 200ms,
   * fire-and-forget a slow-query log entry.
   *
   * @param metric     Name of the measured operation (e.g. 'vectorSearch').
   * @param durationMs Measured duration in milliseconds.
   * @param metadata   Optional context stored as query params in the log.
   */
  record(metric: string, durationMs: number, metadata?: object) {
    // Existing monitoring code
    // NOTE(review): assumes `this.metrics` is declared in the elided
    // "existing implementation" above — confirm before copying this snippet.
    this.metrics.push({ metric, durationMs, timestamp: Date.now() });

    // Log slow vector searches
    // The 200ms cutoff mirrors the threshold inside log_slow_query.
    // The .catch is defensive: logSlowQuery already swallows its own errors,
    // so this mainly guards against future refactors of that function.
    if (metric === 'vectorSearch' && durationMs >= 200) {
      logSlowQuery({
        queryText: 'vector_search',
        executionTimeMs: durationMs,
        queryParams: metadata,
      }).catch(err => {
        console.error('[Monitoring] Slow query logging failed:', err);
      });
    }
  }
}

File: server/_core/embeddings/search.ts

import { logSlowQuery } from '../rag/latency';

/**
 * Semantic vector search over protocol chunks, instrumented so that any
 * execution taking 200ms or longer is recorded in slow_query_log.
 */
export async function semanticSearchProtocols(params: {
  // ... existing params
}): Promise<SearchResult[]> {
  const startTime = Date.now();

  // ... existing search logic

  const executionTime = Date.now() - startTime;

  // Log slow queries
  // NOTE(review): awaiting here adds the logging round-trip to an
  // already-slow request; consider fire-and-forget (void logSlowQuery(...))
  // since logSlowQuery catches its own errors and never throws.
  if (executionTime >= 200) {
    await logSlowQuery({
      queryText: 'semantic_search',
      executionTimeMs: executionTime,
      queryParams: {
        query: params.query,
        agencyId: params.agencyId,
        stateCode: params.stateCode,
        limit: params.limit,
        threshold: params.threshold,
        efSearch: efSearch, // assumes efSearch is set in the elided search logic — confirm
      },
      agencyId: params.agencyId || undefined,
      stateCode: params.stateCode || undefined,
    });
  }

  return result; // `result` comes from the elided search logic above
}

2. Connection Pool Metrics

Add Pool Stats Endpoint

File: server/db/connection.ts

/**
 * Snapshot of the pg connection pool counters for monitoring endpoints.
 *
 * Returns zeroed counters with status 'not_initialized' when the pool has
 * not been created yet.
 */
export function getPoolStats() {
  if (!_pool) {
    return {
      totalCount: 0,
      idleCount: 0,
      waitingCount: 0,
      status: 'not_initialized',
    };
  }

  const { totalCount, idleCount, waitingCount } = _pool;
  return {
    totalCount,   // total open connections
    idleCount,    // connections free for checkout
    waitingCount, // callers queued for a connection
    status: 'healthy',
  };
}

/**
 * Check whether the connection pool is effectively exhausted (health check).
 *
 * Exhausted means more than 80% of connections are checked out AND at
 * least one caller is queued waiting for a connection.
 *
 * @returns true when the pool should trigger an alert.
 */
export function isPoolExhausted(): boolean {
  if (!_pool) return false;

  const stats = getPoolStats();

  // Guard the division: with zero connections the original expression
  // produced NaN (NaN > 0.8 is false, so the result was the same, but
  // the intent is clearer — and safer — with an explicit check).
  if (stats.totalCount === 0) return false;

  const utilization = (stats.totalCount - stats.idleCount) / stats.totalCount;

  // Alert if >80% utilization and requests are queued
  return utilization > 0.8 && stats.waitingCount > 0;
}

Add Metrics Endpoint

File: server/routers/admin.ts

import { getPoolStats, isPoolExhausted } from '../db/connection';

// Add to admin router
export const adminRouter = router({
  // ... existing routes

  /**
   * Database performance metrics (admin only).
   *
   * Combines live connection-pool counters with a 24h summary of the
   * slow_query_log table.
   */
  dbMetrics: adminProcedure.query(async ({ ctx }) => {
    const poolStats = getPoolStats();
    const isExhausted = isPoolExhausted();

    // Slow queries from the last 24h, worst first.
    // NOTE: capped at 100 rows, so count24h and the average describe the
    // 100 slowest queries, not necessarily every slow query.
    const { data: slowQueries, error } = await supabase
      .from('slow_query_log')
      .select('execution_time_ms')
      .gte('created_at', new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString())
      .order('execution_time_ms', { ascending: false })
      .limit(100);

    if (error) {
      // Surface the failure in logs but still return pool metrics.
      console.error('[Monitoring] Failed to fetch slow query stats:', error);
    }

    const rows = slowQueries ?? [];
    const slowQueryCount = rows.length;
    const avgSlowQueryTime = slowQueryCount
      ? rows.reduce((sum, q) => sum + q.execution_time_ms, 0) / slowQueryCount
      : 0;

    // Guard against division by zero when the pool has no connections yet
    // (the original computed NaN and returned it to the client).
    const utilizationPercent = poolStats.totalCount
      ? Math.round(
          ((poolStats.totalCount - poolStats.idleCount) / poolStats.totalCount) * 100
        )
      : 0;

    return {
      connectionPool: {
        ...poolStats,
        isExhausted,
        utilizationPercent,
      },
      slowQueries: {
        count24h: slowQueryCount,
        avgExecutionTimeMs: Math.round(avgSlowQueryTime),
      },
      timestamp: new Date().toISOString(),
    };
  }),
});

3. Index Health Monitoring

SQL Queries for Index Analysis

File: scripts/db-health-check.sql

-- ============================================================================
-- Protocol Guide Database Health Check Queries
-- Run these weekly to monitor database health
-- ============================================================================

-- 1. Index Usage Statistics
-- Shows which indexes are being used and how frequently.
-- NOTE: pg_stat_user_indexes exposes relname/indexrelname — the columns
-- tablename/indexname only exist on the pg_indexes view, so the previous
-- version of this query errored.
SELECT
  schemaname,
  relname AS table_name,
  indexrelname AS index_name,
  idx_scan AS index_scans,
  idx_tup_read AS tuples_read,
  idx_tup_fetch AS tuples_fetched,
  pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
  AND relname = 'manus_protocol_chunks'
ORDER BY idx_scan DESC;

-- Expected: idx_manus_chunks_embedding_hnsw should have high scan count

-- 2. Index Bloat Detection
-- Identifies indexes that may need REINDEX.
-- NOTE: the bare ::regclass casts assume 'public' is on the search_path.
SELECT
  indexname,
  pg_size_pretty(pg_relation_size(indexname::regclass)) AS current_size,
  pg_size_pretty(pg_relation_size(tablename::regclass)) AS table_size,
  ROUND(
    100.0 * pg_relation_size(indexname::regclass) / NULLIF(pg_relation_size(tablename::regclass), 0),
    2
  ) AS index_ratio_percent
FROM pg_indexes
WHERE schemaname = 'public'
  AND tablename = 'manus_protocol_chunks'
ORDER BY pg_relation_size(indexname::regclass) DESC;

-- Expected: Vector index should be ~200MB, B-tree indexes <50MB each

-- 3. Table Bloat (Dead Tuples)
-- Shows autovacuum effectiveness
SELECT
  schemaname,
  relname AS table_name,
  n_live_tup AS live_tuples,
  n_dead_tup AS dead_tuples,
  ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_tuple_percent,
  last_vacuum,
  last_autovacuum,
  last_analyze,
  last_autoanalyze
FROM pg_stat_user_tables
WHERE schemaname = 'public'
  AND relname = 'manus_protocol_chunks';

-- Expected: dead_tuple_percent < 10%

-- 4. Query Plan for Vector Search
-- Verify HNSW index is being used
EXPLAIN (ANALYZE, BUFFERS, VERBOSE)
SELECT
  id,
  protocol_title,
  1 - (embedding <=> (SELECT embedding FROM manus_protocol_chunks LIMIT 1)) AS similarity
FROM manus_protocol_chunks
WHERE
  agency_id = 123
  AND embedding IS NOT NULL
ORDER BY embedding <=> (SELECT embedding FROM manus_protocol_chunks LIMIT 1)
LIMIT 10;

-- Expected plan:
-- -> Index Scan using idx_manus_chunks_embedding_hnsw
-- Buffers: shared hit=XXX

-- 5. Slow Queries Summary (Last 7 Days)
SELECT
  query_text,
  COUNT(*) AS occurrences,
  ROUND(AVG(execution_time_ms)) AS avg_time_ms,
  MAX(execution_time_ms) AS max_time_ms,
  ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY execution_time_ms)) AS p95_time_ms
FROM slow_query_log
WHERE created_at >= NOW() - INTERVAL '7 days'
GROUP BY query_text
ORDER BY occurrences DESC, avg_time_ms DESC
LIMIT 20;

-- 6. Connection Pool Status
-- (Run via application endpoint: GET /api/admin/db-metrics)

Automated Health Check Script

File: scripts/db-health-check.ts

import { Pool } from 'pg';
import { createClient } from '@supabase/supabase-js';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const supabase = createClient(
  process.env.SUPABASE_URL!,
  process.env.SUPABASE_SERVICE_ROLE_KEY!
);

/**
 * Check that indexes on manus_protocol_chunks are actually being used.
 *
 * Fails (returns false) when the HNSW vector index reports zero scans,
 * which means vector search is running without the index.
 */
async function checkIndexHealth() {
  // pg_stat_user_indexes has relname/indexrelname, NOT tablename/indexname
  // (those columns belong to the pg_indexes view) — the original query
  // failed with "column tablename does not exist".
  const result = await pool.query(`
    SELECT
      indexrelname AS indexname,
      idx_scan,
      pg_size_pretty(pg_relation_size(indexrelid)) AS size
    FROM pg_stat_user_indexes
    WHERE relname = 'manus_protocol_chunks'
    ORDER BY idx_scan DESC
  `);

  console.log('Index Usage:');
  for (const row of result.rows) {
    console.log(`  ${row.indexname}: ${row.idx_scan} scans, ${row.size}`);
  }

  // Alert if HNSW index has 0 scans (broken index).
  // idx_scan is a bigint, which node-pg returns as a string, so coerce
  // before comparing — the original strict `=== 0` could never match.
  const hnswIndex = result.rows.find(r =>
    r.indexname === 'idx_manus_chunks_embedding_hnsw'
  );

  if (hnswIndex && Number(hnswIndex.idx_scan) === 0) {
    console.error('⚠️  HNSW index has 0 scans - may not be in use!');
    return false;
  }

  return true;
}

/**
 * Report dead-tuple bloat on manus_protocol_chunks.
 *
 * Fails (returns false) when more than 10% of tuples are dead, or when
 * the table has no statistics row at all.
 */
async function checkTableBloat() {
  const result = await pool.query(`
    SELECT
      n_live_tup,
      n_dead_tup,
      ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_percent
    FROM pg_stat_user_tables
    WHERE relname = 'manus_protocol_chunks'
  `);

  const row = result.rows[0];
  if (!row) {
    // Original code crashed here with "cannot read property of undefined".
    console.warn('⚠️  manus_protocol_chunks not found in pg_stat_user_tables');
    return false;
  }

  // NUMERIC columns arrive as strings from node-pg; coerce explicitly
  // (NULLIF can also yield NULL on an empty table → treat as 0%).
  const deadPercent = parseFloat(row.dead_percent) || 0;

  console.log(`\nTable Bloat:`);
  console.log(`  Live tuples: ${row.n_live_tup}`);
  console.log(`  Dead tuples: ${row.n_dead_tup} (${deadPercent}%)`);

  // Alert if >10% dead tuples
  if (deadPercent > 10) {
    console.warn(`⚠️  High dead tuple percentage: ${deadPercent}% - consider VACUUM`);
    return false;
  }

  return true;
}

/**
 * Print the 10 slowest logged queries from the last 24 hours.
 *
 * Fails (returns false) only when the fetch itself errors.
 */
async function checkSlowQueries() {
  const { data, error } = await supabase
    .from('slow_query_log')
    .select('query_text, execution_time_ms')
    .gte('created_at', new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString())
    .order('execution_time_ms', { ascending: false })
    .limit(10);

  if (error) {
    console.error('Failed to fetch slow queries:', error);
    return false;
  }

  // supabase-js types data as possibly null even when error is null;
  // guard instead of dereferencing directly.
  const rows = data ?? [];
  console.log(`\nSlow Queries (Last 24h): ${rows.length}`);
  for (const query of rows) {
    console.log(`  ${query.query_text}: ${query.execution_time_ms}ms`);
  }

  return true;
}

/**
 * Entry point: run every health check, print a summary, and exit non-zero
 * if any check failed or threw.
 */
async function main() {
  console.log('=== Protocol Guide Database Health Check ===\n');

  const checks = [
    { name: 'Index Health', fn: checkIndexHealth },
    { name: 'Table Bloat', fn: checkTableBloat },
    { name: 'Slow Queries', fn: checkSlowQueries },
  ];

  let allPassed = true;

  for (const check of checks) {
    try {
      const passed = await check.fn();
      if (!passed) allPassed = false;
    } catch (error) {
      console.error(`❌ ${check.name} failed:`, error);
      allPassed = false;
    }
  }

  await pool.end();

  console.log('\n' + (allPassed ? '✅ All checks passed' : '⚠️  Some checks failed'));
  process.exit(allPassed ? 0 : 1);
}

// The bare `main();` was a floating promise: a rejection escaping main()
// (e.g. pool.end() failing) would go unhandled and the script could exit 0.
main().catch((error) => {
  console.error('❌ Health check crashed:', error);
  process.exit(1);
});

Usage:

# Run health check manually
pnpm tsx scripts/db-health-check.ts

# Schedule via cron (daily at 3am)
0 3 * * * cd /path/to/protocol-guide && pnpm tsx scripts/db-health-check.ts

4. Monitoring Dashboard

Admin Dashboard Component

File: app/admin/database.tsx

import { trpc } from '../../lib/trpc';
import { useQuery } from '@tanstack/react-query';

/**
 * Admin dashboard panel showing live database metrics (connection pool and
 * slow-query summary), refreshed every 30 seconds.
 */
export default function DatabaseMonitoring() {
  const { data: metrics, isLoading } = useQuery({
    queryKey: ['admin', 'db-metrics'],
    queryFn: async () => {
      return await trpc.admin.dbMetrics.query();
    },
    refetchInterval: 30000, // Refresh every 30s
  });

  if (isLoading) return <div>Loading metrics...</div>;

  // Default to 0 while data is unavailable so the comparisons below are
  // number > number — the original compared possibly-undefined values,
  // which does not type-check under strict TypeScript.
  const poolUtil = metrics?.connectionPool.utilizationPercent ?? 0;
  const waiting = metrics?.connectionPool.waitingCount ?? 0;
  const avgSlowMs = metrics?.slowQueries.avgExecutionTimeMs ?? 0;
  const poolStatus = poolUtil > 80 ? 'critical' : poolUtil > 60 ? 'warning' : 'healthy';

  return (
    <div className="p-6">
      <h1 className="text-2xl font-bold mb-6">Database Performance</h1>

      {/* Connection Pool */}
      <div className="bg-white rounded-lg shadow p-4 mb-6">
        <h2 className="text-lg font-semibold mb-4">Connection Pool</h2>
        <div className="grid grid-cols-4 gap-4">
          <Metric
            label="Total Connections"
            value={metrics?.connectionPool.totalCount}
          />
          <Metric
            label="Idle Connections"
            value={metrics?.connectionPool.idleCount}
          />
          <Metric
            label="Waiting Requests"
            value={waiting}
            status={waiting > 0 ? 'warning' : 'healthy'}
          />
          <Metric
            label="Utilization"
            value={`${poolUtil}%`}
            status={poolStatus}
          />
        </div>
      </div>

      {/* Slow Queries */}
      <div className="bg-white rounded-lg shadow p-4">
        <h2 className="text-lg font-semibold mb-4">Slow Queries (24h)</h2>
        <div className="grid grid-cols-2 gap-4">
          <Metric
            label="Count"
            value={metrics?.slowQueries.count24h}
          />
          <Metric
            label="Avg Execution Time"
            value={`${avgSlowMs}ms`}
            status={avgSlowMs > 500 ? 'critical' : 'healthy'}
          />
        </div>
      </div>
    </div>
  );
}

/**
 * One labeled metric value, colored by health status.
 */
function Metric({ label, value, status = 'healthy' }: {
  label: string;
  value: number | string | undefined;
  status?: 'healthy' | 'warning' | 'critical';
}) {
  // Resolve the Tailwind text color for the given status.
  const statusColor =
    status === 'critical'
      ? 'text-red-600'
      : status === 'warning'
        ? 'text-yellow-600'
        : 'text-green-600';

  return (
    <div>
      <div className="text-sm text-gray-500">{label}</div>
      <div className={`text-2xl font-bold ${statusColor}`}>{value ?? 'N/A'}</div>
    </div>
  );
}

5. Alerts & Notifications

Alert Rules

Configure alerts for:

  1. Connection Pool Exhausted

    • Trigger: waitingCount > 0 AND utilizationPercent > 80
    • Action: Send alert to ops team
    • Resolution: Scale connection pool or optimize queries
  2. Slow Queries Spike

    • Trigger: slowQueryCount > 100 in last hour
    • Action: Log for investigation
    • Resolution: Review slow query log, optimize queries
  3. Index Not Used

    • Trigger: idx_manus_chunks_embedding_hnsw has 0 scans
    • Action: Critical alert (search is broken)
    • Resolution: Rebuild HNSW index
  4. High Table Bloat

    • Trigger: dead_tuple_percent > 15%
    • Action: Schedule VACUUM
    • Resolution: Run VACUUM ANALYZE manus_protocol_chunks

Summary

Implementation Checklist:

  • Create slow_query_log table
  • Instrument search functions with logging
  • Add connection pool metrics endpoint
  • Create db-health-check.ts script
  • Build admin dashboard component
  • Schedule weekly health checks
  • Configure alerts for critical metrics

Expected Outcomes:

  • Real-time visibility into database performance
  • Proactive detection of performance degradation
  • Data-driven optimization decisions
  • Faster incident response

Maintenance:

  • Run health check script weekly
  • Review slow query log monthly
  • Monitor connection pool during traffic spikes
  • VACUUM/ANALYZE after bulk ingestion

Setup Time: 4-8 hours. Maintenance Time: 1-2 hours/month. ROI: High (prevents outages, enables optimization).