Target: Real-time visibility into Protocol Guide database performance
This guide sets up comprehensive database monitoring for:
- Vector search performance (HNSW index usage)
- Connection pool health
- Query plan analysis
- Index bloat detection
- Slow query logging
Estimated Setup Time: 4-8 hours
File: supabase/migrations/YYYYMMDD_add_slow_query_logging.sql
-- Slow query log table
-- One row per query that exceeded the latency threshold (enforced in
-- log_slow_query below), with enough context to investigate it later.
CREATE TABLE IF NOT EXISTS slow_query_log (
id BIGSERIAL PRIMARY KEY,
query_text TEXT NOT NULL, -- logical query name or raw SQL text
execution_time_ms INTEGER NOT NULL, -- measured wall-clock duration
plan JSONB, -- optional EXPLAIN output
query_params JSONB, -- optional bound parameters
user_id INTEGER, -- requesting user, if known
agency_id INTEGER, -- tenant/agency scope, if known
state_code CHAR(2), -- presumably a US state code — confirm with callers
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Indexes use IF NOT EXISTS to match the table definition above, so the
-- migration stays idempotent if re-applied.
-- Index for time-based queries
CREATE INDEX IF NOT EXISTS idx_slow_query_log_created ON slow_query_log(created_at DESC);
-- Index for performance analysis
CREATE INDEX IF NOT EXISTS idx_slow_query_log_execution_time ON slow_query_log(execution_time_ms DESC);
-- Index for user analysis (partial: excludes NULL user_id rows to stay small)
CREATE INDEX IF NOT EXISTS idx_slow_query_log_user ON slow_query_log(user_id) WHERE user_id IS NOT NULL;
COMMENT ON TABLE slow_query_log IS
'Logs queries exceeding performance thresholds for analysis and optimization';
-- Function to log slow queries.
--
-- SECURITY DEFINER is required here: the grants below give `authenticated`
-- only SELECT on slow_query_log, so with default invoker rights the INSERT
-- would fail for application users. search_path is pinned so the qualified
-- objects cannot be hijacked via a caller-controlled search_path.
CREATE OR REPLACE FUNCTION log_slow_query(
    p_query_text TEXT,
    p_execution_time_ms INTEGER,
    p_plan JSONB DEFAULT NULL,
    p_query_params JSONB DEFAULT NULL,
    p_user_id INTEGER DEFAULT NULL,
    p_agency_id INTEGER DEFAULT NULL,
    p_state_code CHAR(2) DEFAULT NULL
)
RETURNS void AS $$
BEGIN
    -- Only log queries exceeding threshold (200ms)
    IF p_execution_time_ms >= 200 THEN
        INSERT INTO slow_query_log (
            query_text,
            execution_time_ms,
            plan,
            query_params,
            user_id,
            agency_id,
            state_code
        ) VALUES (
            p_query_text,
            p_execution_time_ms,
            p_plan,
            p_query_params,
            p_user_id,
            p_agency_id,
            p_state_code
        );
    END IF;
END;
$$ LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = public;
-- Grant access
-- NOTE(review): SELECT for `authenticated` lets any signed-in user read
-- every user's logged query text and parameters — confirm this is intended,
-- or restrict reads to service_role / add an RLS policy.
GRANT SELECT ON slow_query_log TO authenticated, service_role;
GRANT EXECUTE ON FUNCTION log_slow_query TO authenticated, service_role;
-- Retention policy: Auto-delete logs older than 30 days
-- Intended to be run daily by pg_cron or an external scheduler (see below).
CREATE OR REPLACE FUNCTION cleanup_slow_query_log()
RETURNS void AS $$
BEGIN
-- Hard delete; rows are not archived anywhere first.
DELETE FROM slow_query_log
WHERE created_at < NOW() - INTERVAL '30 days';
END;
$$ LANGUAGE plpgsql;
-- Schedule cleanup (run daily via pg_cron or external scheduler)
-- EXAMPLE: SELECT cron.schedule('cleanup-slow-queries', '0 2 * * *', 'SELECT cleanup_slow_query_log()');

File: server/_core/rag/latency.ts
import { createClient } from '@supabase/supabase-js';
// Module-level Supabase client using the service-role key (server-only code;
// this key must never reach a browser bundle).
// NOTE(review): the non-null assertions assume both env vars are always set —
// a missing var surfaces at first use rather than at startup; confirm.
const supabase = createClient(
process.env.SUPABASE_URL!,
process.env.SUPABASE_SERVICE_ROLE_KEY!
);
/**
 * Log a slow query to the `slow_query_log` table via the `log_slow_query` RPC.
 *
 * Best-effort: any failure is logged to the console and swallowed, so
 * monitoring can never take down the request path.
 */
export async function logSlowQuery(params: {
  queryText: string;
  executionTimeMs: number;
  plan?: object;
  queryParams?: object;
  userId?: number;
  agencyId?: number;
  stateCode?: string;
}) {
  try {
    // supabase-js does not throw on RPC failure; it resolves with { error }.
    // Relying on try/catch alone silently ignored every database error.
    const { error } = await supabase.rpc('log_slow_query', {
      p_query_text: params.queryText,
      p_execution_time_ms: params.executionTimeMs,
      // `?? null` (not `|| null`) so legitimate falsy values (e.g. id 0)
      // are not silently converted to NULL.
      p_plan: params.plan ?? null,
      p_query_params: params.queryParams ?? null,
      p_user_id: params.userId ?? null,
      p_agency_id: params.agencyId ?? null,
      p_state_code: params.stateCode ?? null,
    });
    if (error) {
      console.error('[Monitoring] Failed to log slow query:', error);
    }
  } catch (error) {
    // Don't fail the request if logging fails
    console.error('[Monitoring] Failed to log slow query:', error);
  }
}
/**
 * Enhanced latency monitor with slow query logging
 *
 * Fragment: the elided "existing implementation" is assumed to declare the
 * `metrics` array that record() pushes to — TODO confirm against the real file.
 */
export class LatencyMonitor {
// ... existing implementation
/**
 * Record one timing sample and, for vector searches at or above 200ms,
 * fire-and-forget a slow-query log entry (failures are only logged).
 */
record(metric: string, durationMs: number, metadata?: object) {
// Existing monitoring code
this.metrics.push({ metric, durationMs, timestamp: Date.now() });
// Log slow vector searches
// The 200ms cutoff mirrors the server-side threshold in log_slow_query.
if (metric === 'vectorSearch' && durationMs >= 200) {
// Intentionally not awaited: logging must not add latency to the caller.
logSlowQuery({
queryText: 'vector_search',
executionTimeMs: durationMs,
queryParams: metadata,
}).catch(err => {
console.error('[Monitoring] Slow query logging failed:', err);
});
}
}
}

File: server/_core/embeddings/search.ts
import { logSlowQuery } from '../rag/latency';
export async function semanticSearchProtocols(params: {
// ... existing params
}): Promise<SearchResult[]> {
// Wall-clock timing around the entire search, including the elided steps.
const startTime = Date.now();
// ... existing search logic
const executionTime = Date.now() - startTime;
// Log slow queries
// 200ms matches the server-side threshold enforced by log_slow_query.
// NOTE(review): this `await` adds latency to requests that are already slow;
// consider fire-and-forget with .catch(), as LatencyMonitor.record does.
if (executionTime >= 200) {
await logSlowQuery({
queryText: 'semantic_search',
executionTimeMs: executionTime,
queryParams: {
query: params.query,
agencyId: params.agencyId,
stateCode: params.stateCode,
limit: params.limit,
threshold: params.threshold,
efSearch: efSearch, // presumably defined in the elided search logic — confirm
},
agencyId: params.agencyId || undefined,
stateCode: params.stateCode || undefined,
});
}
return result; // `result` comes from the elided search logic — confirm
}

File: server/db/connection.ts
/**
 * Snapshot of the pg connection pool for monitoring dashboards.
 *
 * Before the pool is created, returns zeroed counters with status
 * 'not_initialized' so callers never have to null-check.
 */
export function getPoolStats() {
  if (_pool) {
    return {
      totalCount: _pool.totalCount, // total open connections
      idleCount: _pool.idleCount, // connections ready for checkout
      waitingCount: _pool.waitingCount, // callers queued for a connection
      status: 'healthy',
    };
  }
  return {
    totalCount: 0,
    idleCount: 0,
    waitingCount: 0,
    status: 'not_initialized',
  };
}
/**
 * Check if pool is exhausted (health check).
 *
 * Returns true only when utilization exceeds 80% AND at least one request
 * is queued waiting for a connection. Always false before initialization.
 */
export function isPoolExhausted(): boolean {
  if (!_pool) return false;
  const stats = getPoolStats();
  // Guard: a pool reporting zero connections would make the division below
  // NaN, and NaN comparisons are silently false — make the intent explicit.
  if (stats.totalCount === 0) return false;
  const utilization = (stats.totalCount - stats.idleCount) / stats.totalCount;
  // Alert if >80% utilization and requests are queued
  return utilization > 0.8 && stats.waitingCount > 0;
}

File: server/routers/admin.ts
import { getPoolStats, isPoolExhausted } from '../db/connection';
// Add to admin router
export const adminRouter = router({
// ... existing routes
/**
 * Database performance metrics (admin only)
 *
 * Returns connection-pool counters plus a summary of the 100 slowest
 * queries from the last 24 hours.
 * NOTE(review): `supabase` is not imported in this snippet — presumably in
 * scope in the real module; confirm.
 */
dbMetrics: adminProcedure.query(async ({ ctx }) => {
const poolStats = getPoolStats();
const isExhausted = isPoolExhausted();
// Get slow query stats from last 24h
// (capped at 100 rows, slowest first — the average below is over that
// sample only, not all slow queries in the window)
const { data: slowQueries } = await supabase
.from('slow_query_log')
.select('execution_time_ms')
.gte('created_at', new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString())
.order('execution_time_ms', { ascending: false })
.limit(100);
const slowQueryCount = slowQueries?.length || 0;
const avgSlowQueryTime = slowQueries?.length
? slowQueries.reduce((sum, q) => sum + q.execution_time_ms, 0) / slowQueries.length
: 0;
return {
connectionPool: {
...poolStats,
isExhausted,
// NOTE(review): divides by totalCount — yields NaN when the pool reports
// 0 connections (e.g. not initialized); guard before shipping.
utilizationPercent: Math.round(
((poolStats.totalCount - poolStats.idleCount) / poolStats.totalCount) * 100
),
},
slowQueries: {
count24h: slowQueryCount,
avgExecutionTimeMs: Math.round(avgSlowQueryTime),
},
timestamp: new Date().toISOString(),
};
}),
});

File: scripts/db-health-check.sql
-- ============================================================================
-- Protocol Guide Database Health Check Queries
-- Run these weekly to monitor database health
-- ============================================================================
-- 1. Index Usage Statistics
-- Shows which indexes are being used and how frequently.
-- NOTE: pg_stat_user_indexes exposes relname/indexrelname (tablename and
-- indexname belong to pg_indexes); the original column names would fail
-- with "column does not exist". pg_relation_size takes indexrelid directly.
SELECT
    schemaname,
    relname AS tablename,
    indexrelname AS indexname,
    idx_scan AS index_scans,
    idx_tup_read AS tuples_read,
    idx_tup_fetch AS tuples_fetched,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
  AND relname = 'manus_protocol_chunks'
ORDER BY idx_scan DESC;
-- Expected: idx_manus_chunks_embedding_hnsw should have high scan count
-- 2. Index Bloat Detection
-- Identifies indexes that may need REINDEX
-- (This reports index size relative to table size — a rough heuristic;
-- true bloat measurement would need pgstattuple.)
SELECT
indexname,
pg_size_pretty(pg_relation_size(indexname::regclass)) AS current_size,
pg_size_pretty(pg_relation_size(tablename::regclass)) AS table_size,
-- NULLIF avoids division by zero when the table has zero size
ROUND(
100.0 * pg_relation_size(indexname::regclass) / NULLIF(pg_relation_size(tablename::regclass), 0),
2
) AS index_ratio_percent
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'manus_protocol_chunks'
ORDER BY pg_relation_size(indexname::regclass) DESC;
-- Expected: Vector index should be ~200MB, B-tree indexes <50MB each
-- 3. Table Bloat (Dead Tuples)
-- Shows autovacuum effectiveness
SELECT
schemaname,
relname AS table_name,
n_live_tup AS live_tuples,
n_dead_tup AS dead_tuples,
-- NULLIF guards the empty-table case (0 live + 0 dead -> NULL, not /0)
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_tuple_percent,
-- Timestamps are NULL if the operation has never run since stats reset
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze
FROM pg_stat_user_tables
WHERE schemaname = 'public'
AND relname = 'manus_protocol_chunks';
-- Expected: dead_tuple_percent < 10%
-- 4. Query Plan for Vector Search
-- Verify HNSW index is being used
-- NOTE: the probe-vector subquery (LIMIT 1 with no ORDER BY) picks an
-- arbitrary row — fine for plan inspection, not for correctness checks.
-- NOTE(review): the agency_id filter may lead the planner to skip the HNSW
-- index in favor of a filtered scan — confirm the plan on production data.
EXPLAIN (ANALYZE, BUFFERS, VERBOSE)
SELECT
id,
protocol_title,
-- presumably pgvector cosine distance (<=>); similarity = 1 - distance
1 - (embedding <=> (SELECT embedding FROM manus_protocol_chunks LIMIT 1)) AS similarity
FROM manus_protocol_chunks
WHERE
agency_id = 123
AND embedding IS NOT NULL
ORDER BY embedding <=> (SELECT embedding FROM manus_protocol_chunks LIMIT 1)
LIMIT 10;
-- Expected plan:
-- -> Index Scan using idx_manus_chunks_embedding_hnsw
-- Buffers: shared hit=XXX
-- 5. Slow Queries Summary (Last 7 Days)
-- Aggregates slow_query_log per logical query name: frequency, average,
-- worst case, and p95 latency for the 20 most frequent offenders.
SELECT
query_text,
COUNT(*) AS occurrences,
ROUND(AVG(execution_time_ms)) AS avg_time_ms,
MAX(execution_time_ms) AS max_time_ms,
ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY execution_time_ms)) AS p95_time_ms
FROM slow_query_log
WHERE created_at >= NOW() - INTERVAL '7 days'
GROUP BY query_text
ORDER BY occurrences DESC, avg_time_ms DESC
LIMIT 20;
-- 6. Connection Pool Status
-- (Run via application endpoint: GET /api/admin/db-metrics)

File: scripts/db-health-check.ts
import { Pool } from 'pg';
import { createClient } from '@supabase/supabase-js';
// Direct pg pool for catalog queries (pg_stat_* views).
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Service-role Supabase client for reading slow_query_log.
// NOTE(review): the non-null assertions assume both env vars are set —
// the script will crash at startup otherwise; confirm that's acceptable.
const supabase = createClient(
process.env.SUPABASE_URL!,
process.env.SUPABASE_SERVICE_ROLE_KEY!
);
/**
 * Verify index usage on manus_protocol_chunks.
 *
 * Returns false when the HNSW vector index has never been scanned, which
 * usually means vector search is not hitting the index at all.
 */
async function checkIndexHealth() {
  // pg_stat_user_indexes exposes relname/indexrelname (not the
  // tablename/indexname columns of pg_indexes); the original query would
  // fail with "column does not exist".
  const result = await pool.query(`
    SELECT
      indexrelname AS indexname,
      idx_scan,
      pg_size_pretty(pg_relation_size(indexrelid)) AS size
    FROM pg_stat_user_indexes
    WHERE relname = 'manus_protocol_chunks'
    ORDER BY idx_scan DESC
  `);
  console.log('Index Usage:');
  for (const row of result.rows) {
    console.log(`  ${row.indexname}: ${row.idx_scan} scans, ${row.size}`);
  }
  // Alert if HNSW index has 0 scans (broken index).
  // node-postgres returns bigint columns (idx_scan) as strings, so a strict
  // `=== 0` comparison could never match — coerce to Number first.
  const hnswIndex = result.rows.find(r =>
    r.indexname === 'idx_manus_chunks_embedding_hnsw'
  );
  if (hnswIndex && Number(hnswIndex.idx_scan) === 0) {
    console.error('⚠️ HNSW index has 0 scans - may not be in use!');
    return false;
  }
  return true;
}
/**
 * Report dead-tuple bloat on manus_protocol_chunks.
 *
 * Returns false when dead tuples exceed 10% of the table (a sign that
 * autovacuum is falling behind) or when the table has no stats row.
 */
async function checkTableBloat() {
  const result = await pool.query(`
    SELECT
      n_live_tup,
      n_dead_tup,
      ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_percent
    FROM pg_stat_user_tables
    WHERE relname = 'manus_protocol_chunks'
  `);
  const row = result.rows[0];
  if (!row) {
    // No stats row (table missing or never touched) — previously this was an
    // unguarded TypeError on `row.n_live_tup`.
    console.warn('⚠️ manus_protocol_chunks not found in pg_stat_user_tables');
    return false;
  }
  // node-postgres returns NUMERIC as a string; coerce before comparing.
  // dead_percent is NULL for an empty table -> Number(null) === 0, passes.
  const deadPercent = Number(row.dead_percent);
  console.log(`\nTable Bloat:`);
  console.log(`  Live tuples: ${row.n_live_tup}`);
  console.log(`  Dead tuples: ${row.n_dead_tup} (${row.dead_percent}%)`);
  // Alert if >10% dead tuples
  if (deadPercent > 10) {
    console.warn(`⚠️ High dead tuple percentage: ${row.dead_percent}% - consider VACUUM`);
    return false;
  }
  return true;
}
/**
 * Print the ten slowest logged queries from the last 24 hours.
 *
 * Returns false only when the slow_query_log table cannot be read.
 */
async function checkSlowQueries() {
  const since = new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString();
  const { data, error } = await supabase
    .from('slow_query_log')
    .select('query_text, execution_time_ms')
    .gte('created_at', since)
    .order('execution_time_ms', { ascending: false })
    .limit(10);
  if (error) {
    console.error('Failed to fetch slow queries:', error);
    return false;
  }
  console.log(`\nSlow Queries (Last 24h): ${data.length}`);
  data.forEach(entry => {
    console.log(`  ${entry.query_text}: ${entry.execution_time_ms}ms`);
  });
  return true;
}
/**
 * Run every health check, print a summary, and exit non-zero on failure.
 *
 * A check that throws is counted as a failure but does not prevent the
 * remaining checks from running.
 */
async function main() {
  console.log('=== Protocol Guide Database Health Check ===\n');
  const checks = [
    { name: 'Index Health', fn: checkIndexHealth },
    { name: 'Table Bloat', fn: checkTableBloat },
    { name: 'Slow Queries', fn: checkSlowQueries },
  ];
  let allPassed = true;
  for (const { name, fn } of checks) {
    try {
      if (!(await fn())) {
        allPassed = false;
      }
    } catch (error) {
      console.error(`❌ ${name} failed:`, error);
      allPassed = false;
    }
  }
  await pool.end();
  console.log('\n' + (allPassed ? '✅ All checks passed' : '⚠️ Some checks failed'));
  process.exit(allPassed ? 0 : 1);
}
main();

Usage:
# Run health check manually
pnpm tsx scripts/db-health-check.ts
# Schedule via cron (daily at 3am)
0 3 * * * cd /path/to/protocol-guide && pnpm tsx scripts/db-health-check.ts

File: app/admin/database.tsx
import { trpc } from '../../lib/trpc';
import { useQuery } from '@tanstack/react-query';
/**
 * Admin dashboard page: live database performance metrics.
 *
 * Polls the tRPC admin.dbMetrics endpoint every 30 seconds and renders
 * connection-pool and slow-query summaries with status colouring.
 */
export default function DatabaseMonitoring() {
  const { data: metrics, isLoading } = useQuery({
    queryKey: ['admin', 'db-metrics'],
    queryFn: async () => {
      return await trpc.admin.dbMetrics.query();
    },
    refetchInterval: 30000, // Refresh every 30s
  });
  if (isLoading) return <div>Loading metrics...</div>;
  const poolUtil = metrics?.connectionPool.utilizationPercent ?? 0;
  const poolStatus = poolUtil > 80 ? 'critical' : poolUtil > 60 ? 'warning' : 'healthy';
  // Default missing values to 0 so the comparisons below are well-typed under
  // strictNullChecks (`undefined > 0` does not compile) and behave sensibly
  // while the first fetch is still in flight.
  const waitingCount = metrics?.connectionPool.waitingCount ?? 0;
  const avgSlowMs = metrics?.slowQueries.avgExecutionTimeMs ?? 0;
  return (
    <div className="p-6">
      <h1 className="text-2xl font-bold mb-6">Database Performance</h1>
      {/* Connection Pool */}
      <div className="bg-white rounded-lg shadow p-4 mb-6">
        <h2 className="text-lg font-semibold mb-4">Connection Pool</h2>
        <div className="grid grid-cols-4 gap-4">
          <Metric
            label="Total Connections"
            value={metrics?.connectionPool.totalCount}
          />
          <Metric
            label="Idle Connections"
            value={metrics?.connectionPool.idleCount}
          />
          <Metric
            label="Waiting Requests"
            value={metrics?.connectionPool.waitingCount}
            status={waitingCount > 0 ? 'warning' : 'healthy'}
          />
          <Metric
            label="Utilization"
            value={`${poolUtil}%`}
            status={poolStatus}
          />
        </div>
      </div>
      {/* Slow Queries */}
      <div className="bg-white rounded-lg shadow p-4">
        <h2 className="text-lg font-semibold mb-4">Slow Queries (24h)</h2>
        <div className="grid grid-cols-2 gap-4">
          <Metric
            label="Count"
            value={metrics?.slowQueries.count24h}
          />
          <Metric
            label="Avg Execution Time"
            value={`${metrics?.slowQueries.avgExecutionTimeMs}ms`}
            status={avgSlowMs > 500 ? 'critical' : 'healthy'}
          />
        </div>
      </div>
    </div>
  );
}
/**
 * Single labelled metric value, colour-coded by status
 * (green = healthy, yellow = warning, red = critical).
 */
function Metric({ label, value, status = 'healthy' }: {
  label: string;
  value: number | string | undefined;
  status?: 'healthy' | 'warning' | 'critical';
}) {
  const statusColor =
    status === 'critical' ? 'text-red-600'
    : status === 'warning' ? 'text-yellow-600'
    : 'text-green-600';
  return (
    <div>
      <div className="text-sm text-gray-500">{label}</div>
      <div className={`text-2xl font-bold ${statusColor}`}>{value ?? 'N/A'}</div>
    </div>
  );
}

Configure alerts for:
1. Connection Pool Exhausted
   - Trigger: `waitingCount > 0` AND `utilizationPercent > 80`
   - Action: Send alert to ops team
   - Resolution: Scale connection pool or optimize queries

2. Slow Queries Spike
   - Trigger: `slowQueryCount > 100` in last hour
   - Action: Log for investigation
   - Resolution: Review slow query log, optimize queries

3. Index Not Used
   - Trigger: `idx_manus_chunks_embedding_hnsw` has 0 scans
   - Action: Critical alert (search is broken)
   - Resolution: Rebuild HNSW index

4. High Table Bloat
   - Trigger: `dead_tuple_percent > 15%`
   - Action: Schedule VACUUM
   - Resolution: Run `VACUUM ANALYZE manus_protocol_chunks`
Implementation Checklist:
- Create slow_query_log table
- Instrument search functions with logging
- Add connection pool metrics endpoint
- Create db-health-check.ts script
- Build admin dashboard component
- Schedule weekly health checks
- Configure alerts for critical metrics
Expected Outcomes:
- Real-time visibility into database performance
- Proactive detection of performance degradation
- Data-driven optimization decisions
- Faster incident response
Maintenance:
- Run health check script weekly
- Review slow query log monthly
- Monitor connection pool during traffic spikes
- VACUUM/ANALYZE after bulk ingestion
Setup Time: 4-8 hours
Maintenance Time: 1-2 hours/month
ROI: High (prevents outages, enables optimization)