diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e6427a7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Ensure shell scripts always use LF line endings (required by Linux containers) +*.sh text eol=lf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index af071d0..5356ce3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,3 +49,73 @@ jobs: # - run: pnpm exec nx-cloud record -- echo Hello World # Nx Affected runs only tasks affected by the changes in this PR/commit. Learn more: https://nx.dev/ci/features/affected - run: pnpm exec nx affected -t lint test build + + chaos: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + app: [url-shortener, rate-limiter, web-crawler] + name: chaos / ${{ matrix.app }} + steps: + - uses: actions/checkout@v4 + with: + filter: tree:0 + fetch-depth: 0 + + - uses: pnpm/action-setup@v4 + name: Install pnpm + with: + run_install: false + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: 'pnpm' + + - run: pnpm install --frozen-lockfile + + - name: Generate Prisma client + if: matrix.app == 'url-shortener' + run: pnpm exec nx run @apps/url-shortener:prisma-generate + + - name: Start chaos infrastructure + run: docker compose -f apps/${{ matrix.app }}/docker-compose.chaos.yml up -d --wait + timeout-minutes: 5 + + - name: Wait for Toxiproxy + run: | + for i in $(seq 1 30); do + if curl -sf http://localhost:8474/version > /dev/null 2>&1; then + echo "Toxiproxy is ready" + exit 0 + fi + sleep 1 + done + echo "Toxiproxy failed to start" + docker compose -f apps/${{ matrix.app }}/docker-compose.chaos.yml logs + exit 1 + + - name: Create Toxiproxy proxies for migrations + if: matrix.app == 'url-shortener' + run: | + curl -sf -X POST http://localhost:8474/proxies \ + -H 'Content-Type: application/json' \ + -d '{"name":"postgres","listen":"0.0.0.0:5433","upstream":"db:5432"}' + curl -sf -X POST http://localhost:8474/proxies \ + -H 'Content-Type: application/json' \ + -d '{"name":"redis","listen":"0.0.0.0:6380","upstream":"redis:6379"}' + + - name: Run Prisma migrations + if: matrix.app == 'url-shortener' + run: pnpm exec nx run @apps/url-shortener:prisma-deploy + env: + DATABASE_URL: postgresql://user:pass@localhost:5433/app + + - name: Run chaos tests + run: pnpm exec jest --config apps/${{ matrix.app }}/jest.chaos.config.js --runInBand --forceExit + timeout-minutes: 5 + + - name: Teardown + if: always() + run: docker compose -f apps/${{ matrix.app }}/docker-compose.chaos.yml down -v diff --git a/apps/rate-limiter/docker-compose.chaos.yml b/apps/rate-limiter/docker-compose.chaos.yml new file mode 100644 index 0000000..fc49d17 --- /dev/null +++ b/apps/rate-limiter/docker-compose.chaos.yml @@ -0,0 +1,36 @@ +services: + toxiproxy: + image: ghcr.io/shopify/toxiproxy:latest + ports: + - '8474:8474' # Toxiproxy API + # Expose proxy on port 6379 — the same port the Redis Cluster advertises + # via REDIS_CLUSTER_ANNOUNCE_IP=127.0.0.1. This ensures the cluster client's + # CLUSTER SLOTS redirect to 127.0.0.1:6379 still routes through Toxiproxy. + - '6379:6379' # Proxy → Redis Cluster + healthcheck: + test: ['CMD', '/toxiproxy-cli', 'list'] + interval: 3s + timeout: 5s + retries: 10 + depends_on: + redis-cluster: + condition: service_healthy + + redis-cluster: + image: docker.io/bitnamilegacy/redis-cluster:8.0 + environment: + - 'ALLOW_EMPTY_PASSWORD=yes' + - 'REDIS_CLUSTER_REPLICAS=0' + - 'REDIS_NODES=127.0.0.1 127.0.0.1 127.0.0.1' + - 'REDIS_CLUSTER_CREATOR=yes' + - 'REDIS_CLUSTER_DYNAMIC_IPS=no' + - 'REDIS_CLUSTER_ANNOUNCE_IP=127.0.0.1' + # No host port binding — only Toxiproxy reaches Redis via Docker DNS. + # This prevents port conflicts and ensures all traffic goes through Toxiproxy. + expose: + - '6379' + healthcheck: + test: ['CMD', 'redis-cli', '-c', 'cluster', 'info'] + interval: 3s + timeout: 5s + retries: 20 diff --git a/apps/rate-limiter/jest.chaos.config.js b/apps/rate-limiter/jest.chaos.config.js new file mode 100644 index 0000000..acd3272 --- /dev/null +++ b/apps/rate-limiter/jest.chaos.config.js @@ -0,0 +1,22 @@ +const { readFileSync } = require('fs'); +const { resolve } = require('path'); + +const swcJestConfig = JSON.parse( + readFileSync(resolve(__dirname, '.spec.swcrc'), 'utf-8'), +); + +swcJestConfig.swcrc = false; + +module.exports = { + displayName: '@apps/rate-limiter:chaos', + preset: '../../jest.preset.js', + testEnvironment: 'node', + transform: { + '^.+\\.[tj]s$': ['@swc/jest', swcJestConfig], + }, + moduleFileExtensions: ['ts', 'js', 'html'], + coverageDirectory: 'test-output/jest/coverage-chaos', + testMatch: ['**/*.chaos.spec.ts'], + testTimeout: 60_000, + maxWorkers: 1, +}; diff --git a/apps/rate-limiter/jest.config.cts b/apps/rate-limiter/jest.config.cts index eac8155..814bc22 100644 --- a/apps/rate-limiter/jest.config.cts +++ b/apps/rate-limiter/jest.config.cts @@ -18,5 +18,5 @@ module.exports = { }, moduleFileExtensions: ['ts', 'js', 'html'], coverageDirectory: 'test-output/jest/coverage', - testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts'], + testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts', '!**/*.chaos.spec.ts'], }; diff --git a/apps/rate-limiter/package.json b/apps/rate-limiter/package.json index 54b8b64..098cea5 100644 --- a/apps/rate-limiter/package.json +++ b/apps/rate-limiter/package.json @@ -87,6 +87,18 @@ "passWithNoTests": true } }, + "chaos": { + "executor": "nx:run-commands", + "options": { + "commands": [ + "docker compose -f docker-compose.chaos.yml up -d", + "pnpm exec jest --config jest.chaos.config.js --runInBand --forceExit", + "docker compose -f docker-compose.chaos.yml down" + ], + "cwd": "apps/rate-limiter", + "parallel": false + } + }, "infra:up": { "executor": "nx:run-commands", "options": { diff --git a/apps/rate-limiter/src/chaos/rate-limiter.chaos.spec.ts b/apps/rate-limiter/src/chaos/rate-limiter.chaos.spec.ts new file mode 100644 index 0000000..d077df6 --- /dev/null +++ b/apps/rate-limiter/src/chaos/rate-limiter.chaos.spec.ts @@ -0,0 +1,259 @@ +import { INestApplication } from '@nestjs/common'; +import { Test } from '@nestjs/testing'; +import { + ToxiproxyClient, + createReportCollector, + generateReport, + runScenario, + waitForToxiproxy, +} from '@libs/chaos'; +import request from 'supertest'; +import { AppModule } from '../app.module.js'; + +const TOXIPROXY_API = 'http://localhost:8474'; +// Toxiproxy listens on port 6379 — the same port the Redis Cluster advertises +// via REDIS_CLUSTER_ANNOUNCE_IP=127.0.0.1. This ensures CLUSTER SLOTS redirects +// to 127.0.0.1:6379 still route through Toxiproxy instead of bypassing it. +const REDIS_PROXY_LISTEN = '0.0.0.0:6379'; +const REDIS_UPSTREAM = 'redis-cluster:6379'; + +describe('Rate Limiter — Chaos Tests', () => { + let app: INestApplication; + let toxi: ToxiproxyClient; + const report = createReportCollector('Rate Limiter'); + + beforeAll(async () => { + toxi = new ToxiproxyClient(TOXIPROXY_API); + + await waitForToxiproxy(TOXIPROXY_API, { timeoutMs: 30_000 }); + + // Reset any leftover state from a previous failed run + await toxi.reset().catch(() => { + /* ignore */ + }); + + await toxi.ensureProxy({ + name: 'redis', + listen: REDIS_PROXY_LISTEN, + upstream: REDIS_UPSTREAM, + }); + + // Use port 6379 — matches the Toxiproxy proxy port and the cluster's announce IP + process.env['REDIS_HOST'] = 'redis://localhost:6379'; + + const moduleRef = await Test.createTestingModule({ + imports: [AppModule], + }).compile(); + + app = moduleRef.createNestApplication(); + await app.init(); + }, 60_000); + + afterAll(async () => { + const reportOutput = generateReport([report.toReport()]); + console.log('\n' + reportOutput); + + await app?.close(); + await toxi.reset().catch(() => { + /* ignore */ + }); + }); + + beforeEach(async () => { + await toxi.resetProxy('redis'); + }); + + // ─── Scenario 1: Redis down during rate check (should fail-open) ── + it('should fail-open when Redis is down during rate check', async () => { + await request(app.getHttpServer()) + .get('/rate-limit/check/default') + .expect(200); + + await toxi.disableProxy('redis'); + + await runScenario(report, 'Redis down during rate check', async () => { + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + + if (res.status === 200) { + return { + passed: true, + graceful: true, + notes: 'Fails open — request allowed through despite Redis outage', + }; + } + + return { + passed: false, + graceful: false, + notes: `Got ${res.status} — guard throws unhandled error instead of failing open`, + }; + }); + }); + + // ─── Scenario 2: Redis slow (100ms latency) ─────────────────────── + it('should handle Redis latency (100ms) with degraded performance', async () => { + await toxi.addToxic('redis', { + name: 'latency_downstream', + type: 'latency', + stream: 'downstream', + attributes: { latency: 100, jitter: 20 }, + }); + + await runScenario(report, 'Redis slow (100ms latency)', async () => { + const start = Date.now(); + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + const elapsed = Date.now() - start; + + if (res.status === 200) { + return { + passed: true, + graceful: true, + notes: `Rate limiting works under latency (${elapsed}ms response time)`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Failed with ${res.status} under 100ms latency`, + }; + }); + }); + + // ─── Scenario 3: Redis timeout (5s) ─────────────────────────────── + it('should handle Redis timeout (5s)', async () => { + await toxi.addToxic('redis', { + name: 'timeout_downstream', + type: 'timeout', + stream: 'downstream', + attributes: { timeout: 5000 }, + }); + + await runScenario(report, 'Redis timeout (5s)', async () => { + const start = Date.now(); + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + const elapsed = Date.now() - start; + + if (res.status === 200) { + return { + passed: true, + graceful: true, + notes: `Handled timeout gracefully in ${elapsed}ms`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Got ${res.status} after ${elapsed}ms — likely hung until timeout`, + }; + }); + }); + + // ─── Scenario 4: Rapid connection flap (disable/enable) ─────────── + it('should handle rapid Redis connection flapping', async () => { + await runScenario( + report, + 'Redis connection flap (NOSCRIPT recovery)', + async () => { + await toxi.disableProxy('redis'); + await new Promise((r) => setTimeout(r, 500)); + await toxi.enableProxy('redis'); + await new Promise((r) => setTimeout(r, 1000)); + + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + + if (res.status === 200) { + return { + passed: true, + graceful: true, + notes: 'Script recovery works after brief outage', + }; + } + + return { + passed: false, + graceful: false, + notes: `Got ${res.status} after reconnection — NOSCRIPT recovery may have failed`, + }; + }, + ); + }); + + // ─── Scenario 5: Redis bandwidth limited (1KB/s) ────────────────── + it('should handle severely bandwidth-limited Redis', async () => { + await toxi.addToxic('redis', { + name: 'bandwidth_downstream', + type: 'bandwidth', + stream: 'downstream', + attributes: { rate: 1 }, + }); + + await runScenario(report, 'Redis bandwidth limited (1KB/s)', async () => { + const start = Date.now(); + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + const elapsed = Date.now() - start; + + if (res.status === 200) { + return { + passed: true, + graceful: true, + notes: `Succeeded under bandwidth limit in ${elapsed}ms`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Got ${res.status} (${elapsed}ms) — partial/corrupted Lua script response`, + }; + }); + }); + + // ─── Scenario 6: Redis recovers after outage ────────────────────── + it('should resume rate limiting after Redis recovers', async () => { + await request(app.getHttpServer()) + .get('/rate-limit/check/default') + .expect(200); + + await toxi.disableProxy('redis'); + await new Promise((r) => setTimeout(r, 1000)); + await toxi.enableProxy('redis'); + await new Promise((r) => setTimeout(r, 2000)); + + await runScenario(report, 'Redis recovers after outage', async () => { + const res = await request(app.getHttpServer()).get( + '/rate-limit/check/default', + ); + + if (res.status === 200) { + const hasHeaders = res.headers['x-ratelimit-limit'] !== undefined; + return { + passed: true, + graceful: true, + recovery: 'auto', + notes: hasHeaders + ? 'Fully recovered — rate limit headers present' + : 'Responded 200 but no rate limit headers — partial recovery', + }; + } + + return { + passed: false, + graceful: false, + recovery: 'manual', + notes: `Still failing after restore: ${res.status}`, + }; + }); + }); +}); diff --git a/apps/rate-limiter/src/rate-limiter/guards/rate-limit.guard.ts b/apps/rate-limiter/src/rate-limiter/guards/rate-limit.guard.ts index 9490fc6..b64a168 100644 --- a/apps/rate-limiter/src/rate-limiter/guards/rate-limit.guard.ts +++ b/apps/rate-limiter/src/rate-limiter/guards/rate-limit.guard.ts @@ -29,11 +29,20 @@ export class RateLimitGuard implements CanActivate { return true; // Fail-open: allow if client can't be identified } - const result = await this.rateLimiterService.check(clientId, opts.ruleId); - this.setRateLimitHeaders(context, result); + try { + const result = await this.rateLimiterService.check(clientId, opts.ruleId); + this.setRateLimitHeaders(context, result); - if (!result.allowed) { - throw new RateLimitExceededException(result); + if (!result.allowed) { + throw new RateLimitExceededException(result); + } + } catch (error) { + // Re-throw rate limit exceeded so 429 responses still work + if (error instanceof RateLimitExceededException) { + throw error; + } + // Fail-open: allow request through when Redis is unavailable + return true; } return true; diff --git a/apps/rate-limiter/tsconfig.app.json b/apps/rate-limiter/tsconfig.app.json index da8f8a0..2254e28 100644 --- a/apps/rate-limiter/tsconfig.app.json +++ b/apps/rate-limiter/tsconfig.app.json @@ -20,5 +20,10 @@ "eslint.config.js", "eslint.config.cjs", "eslint.config.mjs" + ], + "references": [ + { + "path": "../../libs/chaos/tsconfig.lib.json" + } ] } diff --git a/apps/url-shortener/docker-compose.chaos.yml b/apps/url-shortener/docker-compose.chaos.yml new file mode 100644 index 0000000..ae3027f --- /dev/null +++ b/apps/url-shortener/docker-compose.chaos.yml @@ -0,0 +1,44 @@ +services: + toxiproxy: + image: ghcr.io/shopify/toxiproxy:latest + ports: + - '8474:8474' # Toxiproxy API + - '6380:6380' # Proxy → Redis + - '5433:5433' # Proxy → Postgres + healthcheck: + test: ['CMD', '/toxiproxy-cli', 'list'] + interval: 3s + timeout: 5s + retries: 10 + depends_on: + redis: + condition: service_healthy + db: + condition: service_healthy + + redis: + image: redis/redis-stack:latest + # No host port binding — only Toxiproxy needs to reach Redis via Docker DNS. + # This prevents port conflicts with the regular docker-compose.yml. + expose: + - '6379' + healthcheck: + test: ['CMD', 'redis-cli', 'ping'] + interval: 3s + timeout: 5s + retries: 10 + + db: + image: postgres:17-alpine + # No host port binding — only Toxiproxy needs to reach Postgres via Docker DNS. + expose: + - '5432' + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: pass + POSTGRES_DB: app + healthcheck: + test: ['CMD-SHELL', 'pg_isready -U user -d app'] + interval: 3s + timeout: 5s + retries: 10 diff --git a/apps/url-shortener/jest.chaos.config.js b/apps/url-shortener/jest.chaos.config.js new file mode 100644 index 0000000..95fa59d --- /dev/null +++ b/apps/url-shortener/jest.chaos.config.js @@ -0,0 +1,22 @@ +const { readFileSync } = require('fs'); +const { resolve } = require('path'); + +const swcJestConfig = JSON.parse( + readFileSync(resolve(__dirname, '.spec.swcrc'), 'utf-8'), +); + +swcJestConfig.swcrc = false; + +module.exports = { + displayName: '@apps/url-shortener:chaos', + preset: '../../jest.preset.js', + testEnvironment: 'node', + transform: { + '^.+\\.[tj]s$': ['@swc/jest', swcJestConfig], + }, + moduleFileExtensions: ['ts', 'js', 'html'], + coverageDirectory: 'test-output/jest/coverage-chaos', + testMatch: ['**/*.chaos.spec.ts'], + testTimeout: 60_000, + maxWorkers: 1, +}; diff --git a/apps/url-shortener/jest.config.ts b/apps/url-shortener/jest.config.ts index b724295..316a7a7 100644 --- a/apps/url-shortener/jest.config.ts +++ b/apps/url-shortener/jest.config.ts @@ -17,5 +17,5 @@ export default { }, moduleFileExtensions: ['ts', 'js', 'html'], coverageDirectory: 'test-output/jest/coverage', - testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts'], // exclude integration tests + testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts', '!**/*.chaos.spec.ts'], }; diff --git a/apps/url-shortener/package.json b/apps/url-shortener/package.json index 832527c..2145265 100644 --- a/apps/url-shortener/package.json +++ b/apps/url-shortener/package.json @@ -52,6 +52,18 @@ "passWithNoTests": true } }, + "chaos": { + "executor": "nx:run-commands", + "options": { + "commands": [ + "docker compose -f docker-compose.chaos.yml up -d", + "pnpm exec jest --config jest.chaos.config.js --runInBand --forceExit", + "docker compose -f docker-compose.chaos.yml down" + ], + "cwd": "apps/url-shortener", + "parallel": false + } + }, "infra:up": { "executor": "nx:run-commands", "options": { diff --git a/apps/url-shortener/src/chaos/url-shortener.chaos.spec.ts b/apps/url-shortener/src/chaos/url-shortener.chaos.spec.ts new file mode 100644 index 0000000..02b42f5 --- /dev/null +++ b/apps/url-shortener/src/chaos/url-shortener.chaos.spec.ts @@ -0,0 +1,306 @@ +import { INestApplication } from '@nestjs/common'; +import { FastifyAdapter } from '@nestjs/platform-fastify'; +import { Test } from '@nestjs/testing'; +import { + ToxiproxyClient, + createReportCollector, + generateReport, + runScenario, + waitForToxiproxy, +} from '@libs/chaos'; +import request from 'supertest'; + +const TOXIPROXY_API = 'http://localhost:8474'; +const REDIS_PROXY_LISTEN = '0.0.0.0:6380'; +const REDIS_UPSTREAM = 'redis:6379'; +const POSTGRES_PROXY_LISTEN = '0.0.0.0:5433'; +const POSTGRES_UPSTREAM = 'db:5432'; + +describe('URL Shortener — Chaos Tests', () => { + let app: INestApplication; + let toxi: ToxiproxyClient; + const report = createReportCollector('URL Shortener'); + + beforeAll(async () => { + // Env vars MUST be set before AppModule is loaded — RedisModule.forRoot() + // reads process.env['REDIS_HOST'] at decorator evaluation time (import). + // Static imports are hoisted by SWC, so we use dynamic require() instead. + process.env['REDIS_HOST'] = 'redis://localhost:6380'; + process.env['DATABASE_URL'] = 'postgresql://user:pass@localhost:5433/app'; + + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { AppModule } = require('../app.module') as { AppModule: any }; + + toxi = new ToxiproxyClient(TOXIPROXY_API); + + await waitForToxiproxy(TOXIPROXY_API, { timeoutMs: 30_000 }); + + // Reset any leftover state from a previous failed run + await toxi.reset().catch(() => { + /* ignore */ + }); + + await toxi.ensureProxy({ + name: 'redis', + listen: REDIS_PROXY_LISTEN, + upstream: REDIS_UPSTREAM, + }); + await toxi.ensureProxy({ + name: 'postgres', + listen: POSTGRES_PROXY_LISTEN, + upstream: POSTGRES_UPSTREAM, + }); + + const moduleRef = await Test.createTestingModule({ + imports: [AppModule], + }).compile(); + + app = moduleRef.createNestApplication(new FastifyAdapter()); + await app.init(); + await app.getHttpAdapter().getInstance().ready(); + }, 60_000); + + afterAll(async () => { + const reportOutput = generateReport([report.toReport()]); + console.log('\n' + reportOutput); + + await app?.close(); + await toxi.reset().catch(() => { + /* ignore */ + }); + }); + + beforeEach(async () => { + await toxi.resetProxy('redis'); + await toxi.resetProxy('postgres'); + }); + + // ─── Scenario 1: Redis cache down during redirect ────────────────── + // Note: CacheModule.register() currently uses in-memory storage, so this + // scenario tests Redis outage impact on counter/throttler, not cache lookups. + it('should handle Redis cache outage during redirect (fall through to Postgres)', async () => { + const createRes = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-1' }) + .expect(201); + + const shortUrl: string = createRes.body.shortUrl; + const shortCode = shortUrl.split('/l/')[1]; + + await toxi.disableProxy('redis'); + + await runScenario(report, 'Redis cache down during redirect', async () => { + const res = await request(app.getHttpServer()) + .get(`/l/${shortCode}`) + .redirects(0); + + if (res.status === 302) { + return { + passed: true, + graceful: true, + notes: 'Graceful fallback to Postgres', + }; + } + + return { + passed: false, + graceful: false, + notes: `Got ${res.status} — cacheManager.get() likely threw unhandled error`, + }; + }); + }); + + // ─── Scenario 2: Postgres down during URL creation ───────────────── + it('should return error when Postgres is down during URL creation', async () => { + await toxi.disableProxy('postgres'); + + await runScenario(report, 'Postgres down during URL creation', async () => { + const res = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-2' }); + + if (res.status >= 500) { + return { + passed: true, + graceful: true, + notes: `Returns ${res.status} as expected when DB is down`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Unexpected status ${res.status}`, + }; + }); + }); + + // ─── Scenario 3: Redis counter down during URL creation ──────────── + // The app may use a Postgres-backed counter (fallback), so URL creation + // can succeed even when Redis is down. Both outcomes demonstrate resilience. + it('should handle Redis counter failure during URL creation', async () => { + await toxi.disableProxy('redis'); + + await runScenario( + report, + 'Redis counter down during URL creation', + async () => { + const res = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-3' }); + + if (res.status === 201) { + return { + passed: true, + graceful: true, + notes: + 'URL creation succeeded despite Redis outage — counter fallback to Postgres', + }; + } + + if (res.status >= 400) { + return { + passed: true, + graceful: res.status < 500, + notes: `Returns ${res.status} — ${res.status >= 500 ? 'unhandled error' : 'graceful error'}`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Unexpected status ${res.status}`, + }; + }, + ); + }); + + // ─── Scenario 4: High latency on Redis (500ms) ──────────────────── + it('should tolerate high Redis latency (500ms)', async () => { + await toxi.addToxic('redis', { + name: 'latency_downstream', + type: 'latency', + stream: 'downstream', + attributes: { latency: 500, jitter: 100 }, + }); + + await runScenario(report, 'High latency on Redis (500ms)', async () => { + const start = Date.now(); + const res = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-4' }); + + const elapsed = Date.now() - start; + + if (res.status === 201) { + return { + passed: true, + graceful: true, + notes: `Succeeded in ${elapsed}ms (latency amplification: ~${elapsed - 100}ms overhead)`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Failed with ${res.status} under latency (${elapsed}ms)`, + }; + }); + }); + + // ─── Scenario 5: High latency on Postgres (2s) ──────────────────── + it('should tolerate high Postgres latency (2s)', async () => { + await toxi.addToxic('postgres', { + name: 'latency_downstream', + type: 'latency', + stream: 'downstream', + attributes: { latency: 2000, jitter: 500 }, + }); + + await runScenario(report, 'High latency on Postgres (2s)', async () => { + const start = Date.now(); + const res = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-5' }); + + const elapsed = Date.now() - start; + + if (res.status === 201) { + return { + passed: true, + graceful: true, + notes: `Succeeded in ${elapsed}ms under Postgres latency`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Failed with ${res.status} (${elapsed}ms)`, + }; + }); + }); + + // ─── Scenario 6: Both Redis + Postgres down ─────────────────────── + it('should produce a clean error when both Redis and Postgres are down', async () => { + await toxi.disableProxy('redis'); + await toxi.disableProxy('postgres'); + + await runScenario(report, 'Both Redis + Postgres down', async () => { + const res = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-6' }); + + if (res.status >= 500) { + return { + passed: true, + graceful: false, + notes: `Returns ${res.status} — total failure mode`, + }; + } + + return { + passed: false, + graceful: false, + notes: `Unexpected status ${res.status} with all infra down`, + }; + }); + }); + + // ─── Scenario 7: Redis recovers after outage ────────────────────── + it('should recover automatically when Redis comes back', async () => { + const createRes = await request(app.getHttpServer()) + .post('/url') + .send({ url: 'https://example.com/chaos-test-7' }) + .expect(201); + + const shortCode = createRes.body.shortUrl.split('/l/')[1]; + + await toxi.disableProxy('redis'); + await new Promise((r) => setTimeout(r, 1000)); + await toxi.enableProxy('redis'); + await new Promise((r) => setTimeout(r, 2000)); + + await runScenario(report, 'Redis recovers after outage', async () => { + const res = await request(app.getHttpServer()) + .get(`/l/${shortCode}`) + .redirects(0); + + if (res.status === 302) { + return { + passed: true, + graceful: true, + recovery: 'auto', + notes: 'App recovered automatically after Redis restore', + }; + } + + return { + passed: false, + graceful: false, + recovery: 'manual', + notes: `Still failing after restore: ${res.status} — connection pool may need manual recovery`, + }; + }); + }); +}); diff --git a/apps/url-shortener/tsconfig.app.json b/apps/url-shortener/tsconfig.app.json index 7b8c91a..0cca60d 100644 --- a/apps/url-shortener/tsconfig.app.json +++ b/apps/url-shortener/tsconfig.app.json @@ -23,6 +23,9 @@ "references": [ { "path": "../../libs/shared/tsconfig.lib.json" + }, + { + "path": "../../libs/chaos/tsconfig.lib.json" } ] } diff --git a/apps/web-crawler/docker-compose.chaos.yml b/apps/web-crawler/docker-compose.chaos.yml new file mode 100644 index 0000000..78b4b61 --- /dev/null +++ b/apps/web-crawler/docker-compose.chaos.yml @@ -0,0 +1,38 @@ +services: + toxiproxy: + image: ghcr.io/shopify/toxiproxy:latest + ports: + - '8474:8474' # Toxiproxy API + - '4567:4567' # Proxy → LocalStack + healthcheck: + test: ['CMD', '/toxiproxy-cli', 'list'] + interval: 3s + timeout: 5s + retries: 10 + depends_on: + localstack: + condition: service_healthy + + localstack: + image: localstack/localstack:latest + environment: + - SERVICES=s3,sqs,dynamodb + - AWS_DEFAULT_REGION=eu-central-1 + - DEBUG=1 + - AWS_ACCESS_KEY_ID=test + - AWS_SECRET_ACCESS_KEY=test + # No host port binding — only Toxiproxy needs to reach LocalStack via Docker DNS. + # This prevents port conflicts with the regular docker-compose.yml. + expose: + - '4566' + volumes: + - './docker/localstack/init-scripts/init-localstack.sh:/etc/localstack/init/ready.d/init-aws.sh' + healthcheck: + test: + [ + 'CMD-SHELL', + 'curl -sf http://localhost:4566/_localstack/health | grep -q "running"', + ] + interval: 5s + timeout: 5s + retries: 30 diff --git a/apps/web-crawler/jest.chaos.config.js b/apps/web-crawler/jest.chaos.config.js new file mode 100644 index 0000000..0cd05f6 --- /dev/null +++ b/apps/web-crawler/jest.chaos.config.js @@ -0,0 +1,22 @@ +const { readFileSync } = require('fs'); +const { resolve } = require('path'); + +const swcJestConfig = JSON.parse( + readFileSync(resolve(__dirname, '.spec.swcrc'), 'utf-8'), +); + +swcJestConfig.swcrc = false; + +module.exports = { + displayName: '@apps/web-crawler:chaos', + preset: '../../jest.preset.js', + testEnvironment: 'node', + transform: { + '^.+\\.[tj]s$': ['@swc/jest', swcJestConfig], + }, + moduleFileExtensions: ['ts', 'js', 'html'], + coverageDirectory: 'test-output/jest/coverage-chaos', + testMatch: ['**/*.chaos.spec.ts'], + testTimeout: 60_000, + maxWorkers: 1, +}; diff --git a/apps/web-crawler/jest.config.ts b/apps/web-crawler/jest.config.ts index 30ceec7..d511710 100644 --- a/apps/web-crawler/jest.config.ts +++ b/apps/web-crawler/jest.config.ts @@ -17,5 +17,5 @@ export default { }, moduleFileExtensions: ['ts', 'js', 'html'], coverageDirectory: 'test-output/jest/coverage', - testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts'], // exclude integration tests + testMatch: ['**/*.spec.ts', '!**/*.int.spec.ts', '!**/*.chaos.spec.ts'], }; diff --git a/apps/web-crawler/package.json b/apps/web-crawler/package.json index 11cd19c..012e80b 100644 --- a/apps/web-crawler/package.json +++ b/apps/web-crawler/package.json @@ -52,6 +52,18 @@ "passWithNoTests": true } }, + "chaos": { + "executor": "nx:run-commands", + "options": { + "commands": [ + "docker compose -f docker-compose.chaos.yml up -d", + "pnpm exec jest --config jest.chaos.config.js --runInBand --forceExit", + "docker compose -f docker-compose.chaos.yml down" + ], + "cwd": "apps/web-crawler", + "parallel": false + } + }, "infra:up": { "executor": "nx:run-commands", "options": { diff --git a/apps/web-crawler/src/chaos/web-crawler.chaos.spec.ts b/apps/web-crawler/src/chaos/web-crawler.chaos.spec.ts new file mode 100644 index 0000000..718e6cb --- /dev/null +++ b/apps/web-crawler/src/chaos/web-crawler.chaos.spec.ts @@ -0,0 +1,260 @@ +import { INestApplication, Module } from '@nestjs/common'; +import { ConfigModule } from '@nestjs/config'; +import { Test } from '@nestjs/testing'; +import { + ToxiproxyClient, + createReportCollector, + generateReport, + runScenario, + waitForToxiproxy, +} from '@libs/chaos'; +import { DynamoDBClient } from '@aws-sdk/client-dynamodb'; +import { S3Client } from '@aws-sdk/client-s3'; +import { DynamoDBDocumentClient } from '@aws-sdk/lib-dynamodb'; +import { AppConfigService } from '../config'; +import { ContentRepository } from '../repositories/content-repository/repository'; +import { CrawlMetadataRepository } from '../repositories/crawl-metadata-repository/repository'; + +const TOXIPROXY_API = 'http://localhost:8474'; +const LOCALSTACK_PROXY_LISTEN = '0.0.0.0:4567'; +const LOCALSTACK_UPSTREAM = 'localstack:4566'; +const LOCALSTACK_PROXY_URL = 'http://localhost:4567'; + +/** + * Web Crawler chaos tests. + * + * The Web Crawler is a queue-driven microservice (no HTTP endpoints for business logic). + * Chaos is tested by: + * 1. Booting a test module with core AWS providers pointing at Toxiproxy + * 2. Injecting faults on the LocalStack proxy + * 3. Verifying the DI container and infrastructure-dependent providers survive + * + * Note: We use a focused test module instead of AppModule to avoid + * @ssut/nestjs-sqs → @golevelup/nestjs-discovery DiscoveryModule + * incompatibility with NestJS 11 testing. The chaos assertions (DI container + * survival, provider resolution) are unaffected by this. + */ +describe('Web Crawler — Chaos Tests', () => { + let app: INestApplication; + let toxi: ToxiproxyClient; + const report = createReportCollector('Web Crawler'); + + beforeAll(async () => { + // Set env vars for ConfigService before module compilation + process.env['AWS_ENDPOINT_URL'] = LOCALSTACK_PROXY_URL; + process.env['AWS_REGION'] = 'eu-central-1'; + process.env['AWS_ACCESS_KEY_ID'] = 'test'; + process.env['AWS_SECRET_ACCESS_KEY'] = 'test'; + process.env['AWS_S3_CONTENT_BUCKET'] = 'web-crawler-bucket'; + process.env['AWS_DYNAMODB_CRAWL_METADATA_TABLE_NAME'] = + 'crawl-metadata-table'; + process.env['AWS_SQS_CONTENT_DISCOVERY_QUEUE_URL'] = + 'http://localhost:4567/000000000000/content-discovery-queue'; + process.env['AWS_SQS_CONTENT_DISCOVERY_QUEUE_NAME'] = + 'content-discovery-queue'; + process.env['AWS_SQS_CONTENT_PROCESSING_QUEUE_NAME'] = + 'content-processor-queue'; + process.env['AWS_SQS_CONTENT_PROCESSING_QUEUE_URL'] = + 'http://localhost:4567/000000000000/content-processor-queue'; + + toxi = new ToxiproxyClient(TOXIPROXY_API); + + await waitForToxiproxy(TOXIPROXY_API, { timeoutMs: 30_000 }); + + // Reset any leftover state from a previous failed run + await toxi.reset().catch(() => { + /* ignore */ + }); + + await toxi.ensureProxy({ + name: 'localstack', + listen: LOCALSTACK_PROXY_LISTEN, + upstream: LOCALSTACK_UPSTREAM, + }); + + // Focused test module: provides core AWS infrastructure providers without + // SqsModule (avoids DiscoveryModule compatibility issues in NestJS 11 testing). + // Chaos assertions only check DI container survival, not SQS processing. + @Module({ + imports: [ConfigModule.forRoot({ isGlobal: true })], + providers: [ + AppConfigService, + { + provide: S3Client, + useValue: new S3Client({ + forcePathStyle: true, + endpoint: LOCALSTACK_PROXY_URL, + region: 'eu-central-1', + credentials: { accessKeyId: 'test', secretAccessKey: 'test' }, + }), + }, + { + provide: DynamoDBDocumentClient, + useValue: DynamoDBDocumentClient.from( + new DynamoDBClient({ + endpoint: LOCALSTACK_PROXY_URL, + region: 'eu-central-1', + credentials: { accessKeyId: 'test', secretAccessKey: 'test' }, + }), + ), + }, + ContentRepository, + CrawlMetadataRepository, + ], + }) + class ChaosTestModule {} + + const moduleRef = await Test.createTestingModule({ + imports: [ChaosTestModule], + }).compile(); + + app = moduleRef.createNestApplication(); + await app.init(); + }, 60_000); + + afterAll(async () => { + const reportOutput = generateReport([report.toReport()]); + console.log('\n' + reportOutput); + + await app?.close(); + await toxi.reset().catch(() => { + /* ignore */ + }); + }); + + beforeEach(async () => { + await toxi.resetProxy('localstack'); + }); + + /** + * Verifies the NestJS app is still responsive by checking that the + * DI container can resolve infrastructure-dependent providers. + */ + async function assertAppAlive(): Promise { + try { + app.get(ContentRepository); + app.get(CrawlMetadataRepository); + return true; + } catch { + return false; + } + } + + // ─── Scenario 1: S3 timeout during content storage ───────────────── + it('should handle S3 timeout during content storage', async () => { + await toxi.addToxic('localstack', { + name: 'timeout_downstream', + type: 'timeout', + stream: 'downstream', + attributes: { timeout: 3000 }, + }); + + await runScenario(report, 'S3 timeout during content storage', async () => { + await new Promise((r) => setTimeout(r, 5000)); + const alive = await assertAppAlive(); + + if (alive) { + return { + passed: true, + graceful: true, + notes: + 'App survived S3 timeout — messages should go to DLQ after maxReceiveCount', + }; + } + + return { + passed: false, + graceful: false, + notes: 'App crashed after S3 timeout', + }; + }); + }); + + // ─── Scenario 2: SQS slow (2s latency) ──────────────────────────── + it('should handle SQS latency (2s)', async () => { + await toxi.addToxic('localstack', { + name: 'latency_downstream', + type: 'latency', + stream: 'downstream', + attributes: { latency: 2000, jitter: 500 }, + }); + + await runScenario(report, 'SQS slow (2s latency)', async () => { + await new Promise((r) => setTimeout(r, 5000)); + const alive = await assertAppAlive(); + + if (alive) { + return { + passed: true, + graceful: true, + notes: 'App continues under SQS latency — crawling slowed but stable', + }; + } + + return { + passed: false, + graceful: false, + notes: 'App crashed under SQS latency', + }; + }); + }); + + // ─── Scenario 3: DynamoDB down during metadata write ─────────────── + it('should handle DynamoDB unavailability during metadata write', async () => { + await toxi.addToxic('localstack', { + name: 'timeout_upstream', + type: 'timeout', + stream: 'upstream', + attributes: { timeout: 1000 }, + }); + + await runScenario( + report, + 'DynamoDB down during metadata write', + async () => { + await new Promise((r) => setTimeout(r, 5000)); + const alive = await assertAppAlive(); + + if (alive) { + return { + passed: true, + graceful: true, + notes: + 'App survived DynamoDB unavailability — may cause duplicate crawls if state is lost', + }; + } + + return { + passed: false, + graceful: false, + notes: 'App crashed during DynamoDB outage', + }; + }, + ); + }); + + // ─── Scenario 4: Total LocalStack outage ─────────────────────────── + it('should survive total LocalStack outage', async () => { + await toxi.disableProxy('localstack'); + + await runScenario(report, 'Total LocalStack outage', async () => { + await new Promise((r) => setTimeout(r, 5000)); + const alive = await assertAppAlive(); + + if (alive) { + return { + passed: true, + graceful: false, + notes: + 'App survived total outage — all operations failing, no processing', + }; + } + + return { + passed: false, + graceful: false, + notes: 'App crashed during total LocalStack outage', + }; + }); + }); +}); diff --git a/apps/web-crawler/tsconfig.app.json b/apps/web-crawler/tsconfig.app.json index 7b8c91a..0cca60d 100644 --- a/apps/web-crawler/tsconfig.app.json +++ b/apps/web-crawler/tsconfig.app.json @@ -23,6 +23,9 @@ "references": [ { "path": "../../libs/shared/tsconfig.lib.json" + }, + { + "path": "../../libs/chaos/tsconfig.lib.json" } ] } diff --git a/libs/chaos/.spec.swcrc b/libs/chaos/.spec.swcrc new file mode 100644 index 0000000..3b52a53 --- /dev/null +++ b/libs/chaos/.spec.swcrc @@ -0,0 +1,22 @@ +{ + "jsc": { + "target": "es2017", + "parser": { + "syntax": "typescript", + "decorators": true, + "dynamicImport": true + }, + "transform": { + "decoratorMetadata": true, + "legacyDecorator": true + }, + "keepClassNames": true, + "externalHelpers": true, + "loose": true + }, + "module": { + "type": "es6" + }, + "sourceMaps": true, + "exclude": [] +} diff --git a/libs/chaos/README.md b/libs/chaos/README.md new file mode 100644 index 0000000..7a357b4 --- /dev/null +++ b/libs/chaos/README.md @@ -0,0 +1,199 @@ +# @libs/chaos — Chaos Engineering & Resilience Testing + +A shared library for testing how System Craft apps handle infrastructure failures using [Toxiproxy](https://github.com/Shopify/toxiproxy). + +## What is Chaos Engineering? + +Chaos engineering is the discipline of experimenting on a system to build confidence in its ability to withstand turbulent conditions in production. Instead of waiting for outages to reveal weaknesses, we **proactively inject failures** and verify that the system degrades gracefully. + +This is the #1 senior-level interview topic: **fault tolerance and graceful degradation**. + +## How Toxiproxy Works + +``` +App ──→ Toxiproxy ──→ Redis / Postgres / LocalStack + │ + Inject faults: + - Connection drop (disable proxy) + - Latency (100ms–10s) + - Bandwidth limit (KB/s) + - Timeout (hang connection) + - Data corruption (slicer) +``` + +Toxiproxy is a TCP proxy that sits between your app and its infrastructure. It exposes a REST API (port 8474) to inject "toxics" — configurable failure modes — into the connection. + +Each app's `docker-compose.chaos.yml` starts Toxiproxy alongside the real infrastructure. The chaos tests configure proxies and inject faults via the `ToxiproxyClient`. + +## Running Chaos Tests + +### Prerequisites + +- Docker and Docker Compose +- Node.js 18+ + +### Run for a specific app + +```bash +pnpm nx run @apps/url-shortener:chaos +pnpm nx run @apps/rate-limiter:chaos +pnpm nx run @apps/web-crawler:chaos +``` + +Each command will: + +1. Start the chaos Docker Compose (infra + Toxiproxy) +2. Run the chaos test suite (Jest, serial execution) +3. Print the resilience report to stdout +4. Tear down the Docker Compose + +### Run manually (for debugging) + +```bash +# Start infra with Toxiproxy +cd apps/url-shortener +docker compose -f docker-compose.chaos.yml up -d + +# Run tests +npx jest --config jest.chaos.config.ts --runInBand + +# Tear down +docker compose -f docker-compose.chaos.yml down +``` + +## Architecture + +### Library Structure + +``` +libs/chaos/src/ +├── toxiproxy/ +│ ├── client.ts # ToxiproxyClient — typed HTTP client for Toxiproxy REST API +│ └── types.ts # Proxy, Toxic, ToxicConfig interfaces +├── scenarios/ +│ └── scenario.ts # ChaosScenario and ChaosScenarioResult types +├── report/ +│ └── reporter.ts # Markdown resilience report generator +├── helpers/ +│ └── wait-for-service.ts # Health check polling utilities +└── index.ts # Public exports +``` + +### Key Abstractions + +**ToxiproxyClient** — wraps the Toxiproxy REST API: + +```typescript +const toxi = new ToxiproxyClient('http://localhost:8474'); + +// Create a proxy +await toxi.createProxy({ name: 'redis', listen: '0.0.0.0:6380', upstream: 'redis:6379' }); + +// Inject a fault +await toxi.addToxic('redis', { name: 'slow', type: 'latency', attributes: { latency: 500 } }); + +// Simulate total outage +await toxi.disableProxy('redis'); + +// Restore +await toxi.enableProxy('redis'); + +// Clean up all toxics +await toxi.resetProxy('redis'); +``` + +**runScenario** — executes a chaos scenario and records timing/results: + +```typescript +import { createReportCollector, runScenario } from '@libs/chaos'; + +const report = createReportCollector('My App'); + +it('should handle Redis outage', async () => { + await toxi.disableProxy('redis'); + + await runScenario(report, 'Redis down', async () => { + const res = await request(app.getHttpServer()).get('/endpoint'); + if (res.status === 200) { + return { passed: true, graceful: true, notes: 'Handled gracefully' }; + } + return { passed: false, graceful: false, notes: `Got ${res.status}` }; + }); +}); +``` + +Errors thrown inside the scenario function are caught and recorded as failures — they won't crash the Jest suite. + +**Report Collector** — aggregates results and generates markdown: + +```typescript +const report = createReportCollector('My App'); +report.record({ scenario: 'Redis down', passed: false, graceful: false, ... }); +console.log(generateReport([report.toReport()])); +``` + +## Adding New Scenarios + +### 1. Define the scenario in your chaos test file + +```typescript +it('should handle [failure description]', async () => { + // Inject fault + await toxi.addToxic('proxy-name', { + name: 'my-toxic', + type: 'latency', // or: timeout, bandwidth, limit_data, slow_close, slicer + stream: 'downstream', // or: upstream + attributes: { latency: 1000 }, + }); + + // Exercise the system + const res = await request(app.getHttpServer()).get('/endpoint'); + + // Assert expected behavior + expect(res.status).toBe(200); // or whatever graceful degradation looks like +}); +``` + +### 2. Available toxic types + +| Type | Attributes | Effect | +| ------------ | -------------------------------- | ------------------------------- | +| `latency` | `latency` (ms), `jitter` (ms) | Adds delay to data | +| `bandwidth` | `rate` (KB/s) | Limits throughput | +| `timeout` | `timeout` (ms) | Stops data after timeout | +| `slow_close` | `delay` (ms) | Delays TCP close | +| `slicer` | `average_size`, `size_variation` | Slices data into small chunks | +| `limit_data` | `bytes` | Closes connection after N bytes | + +### 3. Disable proxy for total outage + +```typescript +await toxi.disableProxy('redis'); // simulate total failure +await toxi.enableProxy('redis'); // restore connection +``` + +## Reading the Resilience Report + +After tests run, a markdown report is printed: + +``` +# Resilience Report — System Craft +Generated: 2026-03-17T... + +## URL Shortener +| Scenario | Result | Graceful? | Recovery | Duration | Notes | +|----------|--------|-----------|----------|----------|-------| +| Redis cache down | FAIL | NO | n/a | 150ms | cacheManager.get() threw unhandled | +| Postgres down | PASS | YES | n/a | 50ms | Returns 500 as expected | + +## Summary +- Total scenarios: 17 +- Passed: 12 +- Failed: 5 +- Resilience score: 12/17 (71%) +``` + +- **PASS**: System behaved as expected under failure +- **FAIL**: System exhibited unexpected/unhandled behavior +- **Graceful**: System degraded gracefully (e.g., fallback, fail-open) vs crashed +- **Recovery**: Whether the system auto-recovered after restoration diff --git a/libs/chaos/jest.config.cts b/libs/chaos/jest.config.cts new file mode 100644 index 0000000..1546866 --- /dev/null +++ b/libs/chaos/jest.config.cts @@ -0,0 +1,19 @@ +/* eslint-disable */ +const { readFileSync } = require('fs'); + +const swcJestConfig = JSON.parse( + readFileSync(`${__dirname}/.spec.swcrc`, 'utf-8'), +); + +swcJestConfig.swcrc = false; + +module.exports = { + displayName: '@libs/chaos', + preset: '../../jest.preset.js', + testEnvironment: 'node', + transform: { + '^.+\\.[tj]s$': ['@swc/jest', swcJestConfig], + }, + moduleFileExtensions: ['ts', 'js'], + testMatch: ['**/*.spec.ts'], +}; diff --git a/libs/chaos/package.json b/libs/chaos/package.json new file mode 100644 index 0000000..3f8a195 --- /dev/null +++ b/libs/chaos/package.json @@ -0,0 +1,15 @@ +{ + "name": "@libs/chaos", + "version": "0.0.1", + "private": true, + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": { + "types": "./src/index.ts", + "import": "./src/index.ts", + "default": "./src/index.ts" + }, + "./package.json": "./package.json" + } +} diff --git a/libs/chaos/src/helpers/index.ts b/libs/chaos/src/helpers/index.ts new file mode 100644 index 0000000..3e5b040 --- /dev/null +++ b/libs/chaos/src/helpers/index.ts @@ -0,0 +1,6 @@ +export { + waitForHttpService, + waitForService, + waitForToxiproxy, +} from './wait-for-service.js'; +export type { WaitOptions } from './wait-for-service.js'; diff --git a/libs/chaos/src/helpers/wait-for-service.ts b/libs/chaos/src/helpers/wait-for-service.ts new file mode 100644 index 0000000..8c31308 --- /dev/null +++ b/libs/chaos/src/helpers/wait-for-service.ts @@ -0,0 +1,84 @@ +export interface WaitOptions { + /** Max time to wait in ms */ + timeoutMs?: number; + /** Interval between retries in ms */ + intervalMs?: number; + /** Label for error messages */ + label?: string; +} + +/** + * Polls a health check function until it returns true or the timeout is reached. + */ +export async function waitForService( + healthCheck: () => Promise, + options: WaitOptions = {}, +): Promise { + const { timeoutMs = 30_000, intervalMs = 500, label = 'service' } = options; + + const deadline = Date.now() + timeoutMs; + + // Use do-while to guarantee at least one health check attempt, + // even when timeoutMs is 0 or very small. + do { + try { + if (await healthCheck()) { + return; + } + } catch { + // Health check threw — keep retrying + } + + const remaining = deadline - Date.now(); + if (remaining <= 0) break; + + // Cap sleep to the remaining time so we don't overshoot the deadline + await new Promise((resolve) => + setTimeout(resolve, Math.min(intervalMs, remaining)), + ); + } while (Date.now() < deadline); + + throw new Error(`${label} did not become healthy within ${timeoutMs}ms`); +} + +/** + * Waits for an HTTP service to respond on the given host and port. + * Note: this uses HTTP fetch, so it only works for HTTP services (not raw TCP like Redis/Postgres). + */ +export async function waitForHttpService( + host: string, + port: number, + options: WaitOptions = {}, +): Promise { + return waitForService( + async () => { + try { + const response = await fetch(`http://${host}:${port}`); + return response.ok || response.status < 500; + } catch { + return false; + } + }, + { ...options, label: options.label ?? `${host}:${port}` }, + ); +} + +/** + * Waits for the Toxiproxy API to be ready. + */ +export async function waitForToxiproxy( + apiUrl = 'http://localhost:8474', + options: WaitOptions = {}, +): Promise { + return waitForService( + async () => { + try { + const response = await fetch(`${apiUrl}/version`); + return response.ok; + } catch { + return false; + } + }, + { ...options, label: options.label ?? 'Toxiproxy' }, + ); +} diff --git a/libs/chaos/src/index.ts b/libs/chaos/src/index.ts new file mode 100644 index 0000000..e12948b --- /dev/null +++ b/libs/chaos/src/index.ts @@ -0,0 +1,4 @@ +export * from './toxiproxy/index.js'; +export * from './scenarios/index.js'; +export * from './report/index.js'; +export * from './helpers/index.js'; diff --git a/libs/chaos/src/report/index.ts b/libs/chaos/src/report/index.ts new file mode 100644 index 0000000..e50adfa --- /dev/null +++ b/libs/chaos/src/report/index.ts @@ -0,0 +1,2 @@ +export { createReportCollector, generateReport } from './reporter.js'; +export type { ResilienceReport } from './reporter.js'; diff --git a/libs/chaos/src/report/reporter.ts b/libs/chaos/src/report/reporter.ts new file mode 100644 index 0000000..9f9ccd8 --- /dev/null +++ b/libs/chaos/src/report/reporter.ts @@ -0,0 +1,82 @@ +import type { ChaosScenarioResult } from '../scenarios/scenario.js'; + +export interface ResilienceReport { + app: string; + results: ChaosScenarioResult[]; + generatedAt: string; +} + +/** + * Generates a markdown resilience report from chaos test results. + */ +export function generateReport(reports: ResilienceReport[]): string { + const lines: string[] = []; + const now = reports[0]?.generatedAt ?? new Date().toISOString(); + + lines.push('# Resilience Report — System Craft'); + lines.push(`Generated: ${now}`); + lines.push(''); + + let totalScenarios = 0; + let totalPassed = 0; + let totalGraceful = 0; + + for (const report of reports) { + lines.push(`## ${report.app}`); + lines.push(''); + lines.push( + '| Scenario | Result | Graceful? | Recovery | Duration | Notes |', + ); + lines.push( + '|----------|--------|-----------|----------|----------|-------|', + ); + + for (const r of report.results) { + totalScenarios++; + if (r.passed) totalPassed++; + if (r.graceful) totalGraceful++; + + lines.push( + `| ${r.scenario} | ${r.passed ? 'PASS' : 'FAIL'} | ${r.graceful ? 'YES' : 'NO'} | ${r.recovery} | ${r.durationMs}ms | ${r.notes} |`, + ); + } + + lines.push(''); + } + + lines.push('## Summary'); + lines.push(''); + lines.push(`- **Total scenarios:** ${totalScenarios}`); + lines.push(`- **Passed:** ${totalPassed}`); + lines.push(`- **Failed:** ${totalScenarios - totalPassed}`); + lines.push(`- **Graceful degradation:** ${totalGraceful}/${totalScenarios}`); + lines.push( + `- **Resilience score:** ${totalPassed}/${totalScenarios} (${totalScenarios > 0 ? Math.round((totalPassed / totalScenarios) * 100) : 0}%)`, + ); + lines.push(''); + + return lines.join('\n'); +} + +/** + * Collects results from Jest test runs into a ResilienceReport. + */ +export function createReportCollector(app: string): { + record: (result: ChaosScenarioResult) => void; + toReport: () => ResilienceReport; +} { + const results: ChaosScenarioResult[] = []; + + return { + record(result: ChaosScenarioResult) { + results.push(result); + }, + toReport(): ResilienceReport { + return { + app, + results, + generatedAt: new Date().toISOString(), + }; + }, + }; +} diff --git a/libs/chaos/src/scenarios/index.ts b/libs/chaos/src/scenarios/index.ts new file mode 100644 index 0000000..fe960ec --- /dev/null +++ b/libs/chaos/src/scenarios/index.ts @@ -0,0 +1,2 @@ +export { runScenario } from './scenario.js'; +export type { ChaosScenario, ChaosScenarioResult } from './scenario.js'; diff --git a/libs/chaos/src/scenarios/scenario.ts b/libs/chaos/src/scenarios/scenario.ts new file mode 100644 index 0000000..5b0cba6 --- /dev/null +++ b/libs/chaos/src/scenarios/scenario.ts @@ -0,0 +1,67 @@ +import type { ToxicConfig } from '../toxiproxy/types.js'; + +export interface ChaosScenarioResult { + scenario: string; + passed: boolean; + graceful: boolean; + recovery: 'auto' | 'manual' | 'n/a'; + durationMs: number; + notes: string; +} + +export interface ChaosScenario { + /** Human-readable name for the scenario */ + name: string; + /** What this scenario tests */ + description: string; + /** Which Toxiproxy proxy to target */ + proxy: string; + /** What failure to inject (null = disable proxy entirely) */ + toxic: ToxicConfig | null; + /** What the app should do during the failure */ + expectedBehavior: string; +} + +/** + * Runs a chaos scenario, recording timing and results into the report collector. + * Use this in your chaos test files to avoid duplicating the timing/recording logic. + */ +export async function runScenario( + collector: { record: (result: ChaosScenarioResult) => void }, + name: string, + fn: () => Promise>, +): Promise { + const start = Date.now(); + try { + const partial = await fn(); + const result: ChaosScenarioResult = { + scenario: name, + passed: partial.passed ?? true, + graceful: partial.graceful ?? true, + recovery: partial.recovery ?? 'n/a', + durationMs: Date.now() - start, + notes: partial.notes ?? '', + }; + collector.record(result); + + if (!result.passed) { + throw new Error(`Chaos scenario "${name}" failed: ${result.notes}`); + } + } catch (error) { + // Re-throw our own failure errors so Jest marks the test red + if (error instanceof Error && error.message.startsWith('Chaos scenario')) { + throw error; + } + // Unexpected error — record and throw + const result: ChaosScenarioResult = { + scenario: name, + passed: false, + graceful: false, + recovery: 'n/a', + durationMs: Date.now() - start, + notes: `Unexpected error: ${error instanceof Error ? error.message : String(error)}`, + }; + collector.record(result); + throw new Error(`Chaos scenario "${name}" crashed: ${result.notes}`); + } +} diff --git a/libs/chaos/src/toxiproxy/client.spec.ts b/libs/chaos/src/toxiproxy/client.spec.ts new file mode 100644 index 0000000..0d0398a --- /dev/null +++ b/libs/chaos/src/toxiproxy/client.spec.ts @@ -0,0 +1,221 @@ +import { ToxiproxyClient } from './client.js'; +import type { Proxy, Toxic } from './types.js'; + +const mockProxy: Proxy = { + name: 'test-redis', + listen: '0.0.0.0:6380', + upstream: 'redis:6379', + enabled: true, + toxics: [], +}; + +const mockToxic: Toxic = { + name: 'latency_downstream', + type: 'latency', + stream: 'downstream', + toxicity: 1.0, + attributes: { latency: 500, jitter: 100 }, +}; + +describe('ToxiproxyClient', () => { + let client: ToxiproxyClient; + let fetchSpy: jest.SpyInstance; + + beforeEach(() => { + client = new ToxiproxyClient('http://localhost:8474'); + fetchSpy = jest.spyOn(globalThis, 'fetch'); + }); + + afterEach(() => { + fetchSpy.mockRestore(); + }); + + function mockFetchResponse(body: unknown, status = 200): void { + fetchSpy.mockResolvedValueOnce({ + ok: status >= 200 && status < 300, + status, + json: () => Promise.resolve(body), + text: () => Promise.resolve(JSON.stringify(body)), + } as Response); + } + + function mockFetchNoContent(): void { + fetchSpy.mockResolvedValueOnce({ + ok: true, + status: 204, + json: () => Promise.resolve(undefined), + text: () => Promise.resolve(''), + } as Response); + } + + describe('createProxy', () => { + it('should POST to /proxies with the config', async () => { + mockFetchResponse(mockProxy); + + const result = await client.createProxy({ + name: 'test-redis', + listen: '0.0.0.0:6380', + upstream: 'redis:6379', + }); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies', + expect.objectContaining({ + method: 'POST', + body: JSON.stringify({ + name: 'test-redis', + listen: '0.0.0.0:6380', + upstream: 'redis:6379', + }), + }), + ); + expect(result).toEqual(mockProxy); + }); + }); + + describe('getProxy', () => { + it('should GET /proxies/:name', async () => { + mockFetchResponse(mockProxy); + + const result = await client.getProxy('test-redis'); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies/test-redis', + expect.objectContaining({ headers: expect.any(Object) }), + ); + expect(result).toEqual(mockProxy); + }); + }); + + describe('disableProxy', () => { + it('should PATCH with enabled: false', async () => { + mockFetchResponse({ ...mockProxy, enabled: false }); + + const result = await client.disableProxy('test-redis'); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies/test-redis', + expect.objectContaining({ + method: 'PATCH', + body: JSON.stringify({ enabled: false }), + }), + ); + expect(result.enabled).toBe(false); + }); + }); + + describe('enableProxy', () => { + it('should PATCH with enabled: true', async () => { + mockFetchResponse(mockProxy); + + const result = await client.enableProxy('test-redis'); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies/test-redis', + expect.objectContaining({ + method: 'PATCH', + body: JSON.stringify({ enabled: true }), + }), + ); + expect(result.enabled).toBe(true); + }); + }); + + describe('addToxic', () => { + it('should POST toxic config with defaults', async () => { + mockFetchResponse(mockToxic); + + const result = await client.addToxic('test-redis', { + name: 'latency_downstream', + type: 'latency', + attributes: { latency: 500, jitter: 100 }, + }); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies/test-redis/toxics', + expect.objectContaining({ + method: 'POST', + body: JSON.stringify({ + name: 'latency_downstream', + type: 'latency', + attributes: { latency: 500, jitter: 100 }, + stream: 'downstream', + toxicity: 1.0, + }), + }), + ); + expect(result).toEqual(mockToxic); + }); + }); + + describe('removeToxic', () => { + it('should DELETE /proxies/:name/toxics/:toxicName', async () => { + mockFetchNoContent(); + + await client.removeToxic('test-redis', 'latency_downstream'); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/proxies/test-redis/toxics/latency_downstream', + expect.objectContaining({ method: 'DELETE' }), + ); + }); + }); + + describe('reset', () => { + it('should POST to /reset', async () => { + mockFetchNoContent(); + + await client.reset(); + + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:8474/reset', + expect.objectContaining({ method: 'POST' }), + ); + }); + }); + + describe('resetProxy', () => { + it('should remove all toxics and re-enable the proxy', async () => { + // getToxics response + mockFetchResponse([mockToxic]); + // removeToxic response + mockFetchNoContent(); + // enableProxy response + mockFetchResponse(mockProxy); + + await client.resetProxy('test-redis'); + + expect(fetchSpy).toHaveBeenCalledTimes(3); + }); + }); + + describe('isHealthy', () => { + it('should return true when API is reachable', async () => { + fetchSpy.mockResolvedValueOnce({ ok: true } as Response); + + const result = await client.isHealthy(); + expect(result).toBe(true); + }); + + it('should return false when API is unreachable', async () => { + fetchSpy.mockRejectedValueOnce(new Error('ECONNREFUSED')); + + const result = await client.isHealthy(); + expect(result).toBe(false); + }); + }); + + describe('error handling', () => { + it('should throw with status and body on non-OK response', async () => { + fetchSpy.mockResolvedValueOnce({ + ok: false, + status: 404, + text: () => Promise.resolve('proxy not found'), + } as unknown as Response); + + await expect(client.getProxy('missing')).rejects.toThrow( + 'Toxiproxy GET /proxies/missing failed (404): proxy not found', + ); + }); + }); +}); diff --git a/libs/chaos/src/toxiproxy/client.ts b/libs/chaos/src/toxiproxy/client.ts new file mode 100644 index 0000000..e66231b --- /dev/null +++ b/libs/chaos/src/toxiproxy/client.ts @@ -0,0 +1,140 @@ +import type { Proxy, ProxyConfig, Toxic, ToxicConfig } from './types.js'; + +export class ToxiproxyClient { + constructor(private readonly apiUrl = 'http://localhost:8474') {} + + private async request( + path: string, + options: RequestInit = {}, + ): Promise { + const response = await fetch(`${this.apiUrl}${path}`, { + ...options, + headers: { + 'Content-Type': 'application/json', + ...options.headers, + }, + }); + + if (!response.ok) { + const body = await response.text(); + throw new Error( + `Toxiproxy ${options.method ?? 'GET'} ${path} failed (${response.status}): ${body}`, + ); + } + + if (response.status === 204) { + return undefined as T; + } + + // Buffer body as text first — Response bodies are single-use streams, + // so calling .json() then .text() in a catch would always fail. + const text = await response.text(); + try { + return JSON.parse(text) as T; + } catch { + throw new Error( + `Toxiproxy ${options.method ?? 'GET'} ${path} returned non-JSON response (${response.status}): ${text}`, + ); + } + } + + async getProxies(): Promise> { + return this.request>('/proxies'); + } + + async createProxy(config: ProxyConfig): Promise { + return this.request('/proxies', { + method: 'POST', + body: JSON.stringify(config), + }); + } + + async getProxy(name: string): Promise { + return this.request(`/proxies/${name}`); + } + + async deleteProxy(name: string): Promise { + return this.request(`/proxies/${name}`, { method: 'DELETE' }); + } + + async updateProxy( + name: string, + config: Partial, + ): Promise { + return this.request(`/proxies/${name}`, { + method: 'PATCH', + body: JSON.stringify(config), + }); + } + + async disableProxy(name: string): Promise { + return this.updateProxy(name, { enabled: false }); + } + + async enableProxy(name: string): Promise { + return this.updateProxy(name, { enabled: true }); + } + + async addToxic(proxyName: string, toxic: ToxicConfig): Promise { + return this.request(`/proxies/${proxyName}/toxics`, { + method: 'POST', + body: JSON.stringify({ + ...toxic, + stream: toxic.stream ?? 'downstream', + toxicity: toxic.toxicity ?? 1.0, + }), + }); + } + + async getToxics(proxyName: string): Promise { + return this.request(`/proxies/${proxyName}/toxics`); + } + + async removeToxic(proxyName: string, toxicName: string): Promise { + return this.request(`/proxies/${proxyName}/toxics/${toxicName}`, { + method: 'DELETE', + }); + } + + async updateToxic( + proxyName: string, + toxicName: string, + toxic: Partial, + ): Promise { + return this.request(`/proxies/${proxyName}/toxics/${toxicName}`, { + method: 'PATCH', + body: JSON.stringify(toxic), + }); + } + + async reset(): Promise { + return this.request('/reset', { method: 'POST' }); + } + + async resetProxy(proxyName: string): Promise { + const toxics = await this.getToxics(proxyName); + await Promise.all(toxics.map((t) => this.removeToxic(proxyName, t.name))); + await this.enableProxy(proxyName); + } + + async ensureProxy(config: ProxyConfig): Promise { + try { + return await this.createProxy(config); + } catch (error) { + if (error instanceof Error && error.message.includes('409')) { + await this.resetProxy(config.name); + return this.getProxy(config.name); + } + throw error; + } + } + + async isHealthy(): Promise { + try { + const response = await fetch(`${this.apiUrl}/version`); + return response.ok; + } catch { + return false; + } + } +} diff --git a/libs/chaos/src/toxiproxy/index.ts b/libs/chaos/src/toxiproxy/index.ts new file mode 100644 index 0000000..e622b2c --- /dev/null +++ b/libs/chaos/src/toxiproxy/index.ts @@ -0,0 +1,10 @@ +export { ToxiproxyClient } from './client.js'; +export type { + Proxy, + ProxyConfig, + Toxic, + ToxicAttributes, + ToxicConfig, + ToxicStream, + ToxicType, +} from './types.js'; diff --git a/libs/chaos/src/toxiproxy/types.ts b/libs/chaos/src/toxiproxy/types.ts new file mode 100644 index 0000000..275b0b5 --- /dev/null +++ b/libs/chaos/src/toxiproxy/types.ts @@ -0,0 +1,53 @@ +export type ToxicType = + | 'latency' + | 'bandwidth' + | 'slow_close' + | 'timeout' + | 'slicer' + | 'limit_data'; + +export type ToxicStream = 'upstream' | 'downstream'; + +export interface ToxicAttributes { + /** Latency in ms (for 'latency' toxic) */ + latency?: number; + /** Jitter in ms (for 'latency' toxic) */ + jitter?: number; + /** Rate in KB/s (for 'bandwidth' toxic) */ + rate?: number; + /** Delay in ms before closing (for 'slow_close' toxic) */ + delay?: number; + /** Timeout in ms (for 'timeout' toxic) */ + timeout?: number; + /** Average size of sliced bytes (for 'slicer' toxic) */ + average_size?: number; + /** Size variation (for 'slicer' toxic) */ + size_variation?: number; + /** Bytes to allow before cutting (for 'limit_data' toxic) */ + bytes?: number; +} + +export interface ToxicConfig { + name: string; + type: ToxicType; + stream?: ToxicStream; + toxicity?: number; + attributes: ToxicAttributes; +} + +export interface Toxic extends ToxicConfig { + stream: ToxicStream; + toxicity: number; +} + +export interface ProxyConfig { + name: string; + listen: string; + upstream: string; + enabled?: boolean; +} + +export interface Proxy extends ProxyConfig { + enabled: boolean; + toxics: Toxic[]; +} diff --git a/libs/chaos/tsconfig.json b/libs/chaos/tsconfig.json new file mode 100644 index 0000000..c23e61c --- /dev/null +++ b/libs/chaos/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../tsconfig.base.json", + "files": [], + "include": [], + "references": [ + { + "path": "./tsconfig.lib.json" + } + ] +} diff --git a/libs/chaos/tsconfig.lib.json b/libs/chaos/tsconfig.lib.json new file mode 100644 index 0000000..8fac478 --- /dev/null +++ b/libs/chaos/tsconfig.lib.json @@ -0,0 +1,15 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "baseUrl": ".", + "rootDir": "src", + "outDir": "dist", + "tsBuildInfoFile": "dist/tsconfig.lib.tsbuildinfo", + "emitDeclarationOnly": true, + "forceConsistentCasingInFileNames": true, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "exclude": ["src/**/*.spec.ts"], + "references": [] +} diff --git a/nx.json b/nx.json index d4a732c..a3c8fad 100644 --- a/nx.json +++ b/nx.json @@ -29,7 +29,7 @@ "watchDepsName": "watch-deps" } }, - "exclude": ["libs/shared/*"] + "exclude": ["libs/shared/*", "libs/chaos/*"] }, { "plugin": "@nx/webpack/plugin", @@ -56,7 +56,7 @@ }, { "plugin": "@nx/js/typescript", - "include": ["libs/shared/*"], + "include": ["libs/shared/*", "libs/chaos/*"], "options": { "typecheck": { "targetName": "typecheck" diff --git a/package.json b/package.json index 4fc0cad..5719539 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "@commitlint/cli": "^20.2.0", "@commitlint/config-conventional": "^20.2.0", "@eslint/js": "^9.39.2", + "@libs/chaos": "workspace:*", "@libs/shared": "workspace:*", "@nestjs/schematics": "^11.0.9", "@nestjs/testing": "^11.1.10", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e01b278..37258f3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -71,6 +71,9 @@ importers: '@eslint/js': specifier: ^9.39.2 version: 9.39.2 + '@libs/chaos': + specifier: workspace:* + version: link:libs/chaos '@libs/shared': specifier: workspace:* version: link:libs/shared @@ -284,6 +287,8 @@ importers: specifier: workspace:* version: link:../../libs/shared + libs/chaos: {} + libs/shared: dependencies: zod: diff --git a/tsconfig.json b/tsconfig.json index c6739a8..2e1907b 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -23,6 +23,9 @@ }, { "path": "./e2e/rate-limiter" + }, + { + "path": "./libs/chaos" } ] }