From 473c6da8f7387aee7b1ab01a2006c9bebe3c11a1 Mon Sep 17 00:00:00 2001 From: Juhye0k Date: Mon, 9 Mar 2026 18:08:04 +0900 Subject: [PATCH 1/3] =?UTF-8?q?refactor=20:=20fcm=20=EC=84=A4=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fcm/service/FcmMessageSender.java | 35 +++++-------------- .../fcm/service/FcmService.java | 1 - .../study/domain/StudySession.java | 2 +- .../scheduler/MaxFocusStudyScheduler.java | 6 +--- .../study/service/StudySessionService.java | 2 +- 5 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmMessageSender.java b/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmMessageSender.java index d19fd03..0e86c69 100644 --- a/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmMessageSender.java +++ b/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmMessageSender.java @@ -6,12 +6,10 @@ import com.google.firebase.messaging.MessagingErrorCode; import com.google.firebase.messaging.Notification; import com.gpt.geumpumtabackend.fcm.dto.FcmMessageDto; +import com.gpt.geumpumtabackend.fcm.exception.PermanentFcmException; import com.gpt.geumpumtabackend.global.exception.BusinessException; import com.gpt.geumpumtabackend.global.exception.ExceptionType; -import com.gpt.geumpumtabackend.user.domain.User; -import com.gpt.geumpumtabackend.user.repository.UserRepository; import java.util.Set; -import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.retry.annotation.Backoff; import org.springframework.retry.annotation.Recover; @@ -19,21 +17,19 @@ import org.springframework.stereotype.Component; @Component -@RequiredArgsConstructor @Slf4j public class FcmMessageSender { - private static final Set PERMANENT_ERROR_CODES = Set.of( + private static final Set NON_RETRYABLE_ERROR_CODES = Set.of( MessagingErrorCode.UNREGISTERED, MessagingErrorCode.INVALID_ARGUMENT, MessagingErrorCode.SENDER_ID_MISMATCH, MessagingErrorCode.THIRD_PARTY_AUTH_ERROR ); - private final UserRepository userRepository; - @Retryable( retryFor = FirebaseMessagingException.class, + noRetryFor = PermanentFcmException.class, maxAttempts = 3, backoff = @Backoff(delay = 1000, multiplier = 2) ) @@ -55,7 +51,12 @@ public void send(FcmMessageDto messageDto) throws FirebaseMessagingException { try { FirebaseMessaging.getInstance().send(messageBuilder.build()); } catch (FirebaseMessagingException e) { - handleSendFailure(e, messageDto.getToken()); + MessagingErrorCode code = e.getMessagingErrorCode(); + if (code != null && NON_RETRYABLE_ERROR_CODES.contains(code)) { + log.warn("FCM permanent error [{}] for token {}: {}", code, messageDto.getToken(), e.getMessage()); + throw new PermanentFcmException(e); + } + throw e; } } @@ -64,22 +65,4 @@ public void sendRecover(FirebaseMessagingException e, FcmMessageDto messageDto) log.error("FCM send failed after 3 retries for token {}", messageDto.getToken(), e); throw new BusinessException(ExceptionType.FCM_SEND_FAILED); } - - private void handleSendFailure(FirebaseMessagingException e, String token) - throws FirebaseMessagingException { - MessagingErrorCode errorCode = e.getMessagingErrorCode(); - - if (errorCode == null || !PERMANENT_ERROR_CODES.contains(errorCode)) { - throw e; - } - - if (errorCode == MessagingErrorCode.UNREGISTERED) { - log.warn("FCM token unregistered, clearing token: {}", token); - userRepository.findByFcmToken(token) - .ifPresent(User::clearFcmToken); - return; - } - - log.warn("FCM 
permanent error [{}] for token {}: {}", errorCode, token, e.getMessage()); - } } diff --git a/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmService.java b/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmService.java index b6a32e4..1f9af6b 100644 --- a/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmService.java +++ b/src/main/java/com/gpt/geumpumtabackend/fcm/service/FcmService.java @@ -14,7 +14,6 @@ @Service @RequiredArgsConstructor @Slf4j -@Transactional(readOnly = true) public class FcmService { private final UserRepository userRepository; diff --git a/src/main/java/com/gpt/geumpumtabackend/study/domain/StudySession.java b/src/main/java/com/gpt/geumpumtabackend/study/domain/StudySession.java index 82b25e7..cc3b580 100644 --- a/src/main/java/com/gpt/geumpumtabackend/study/domain/StudySession.java +++ b/src/main/java/com/gpt/geumpumtabackend/study/domain/StudySession.java @@ -48,7 +48,7 @@ public void endStudySession(LocalDateTime endTime) { this.totalMillis = Duration.between(this.startTime, this.endTime).toMillis(); } public void endMaxFocusStudySession(int maxFocusTime) { - this.endTime = this.startTime.plusMinutes(maxFocusTime); + this.endTime = this.startTime.plusHours(maxFocusTime); status = StudyStatus.FINISHED; this.totalMillis = Duration.between(this.startTime, this.endTime).toMillis(); } diff --git a/src/main/java/com/gpt/geumpumtabackend/study/scheduler/MaxFocusStudyScheduler.java b/src/main/java/com/gpt/geumpumtabackend/study/scheduler/MaxFocusStudyScheduler.java index d1c9fab..285f533 100644 --- a/src/main/java/com/gpt/geumpumtabackend/study/scheduler/MaxFocusStudyScheduler.java +++ b/src/main/java/com/gpt/geumpumtabackend/study/scheduler/MaxFocusStudyScheduler.java @@ -26,11 +26,7 @@ public void checkAndFinishMaxFocusSessions() { int maxFocusHours = studyProperties.getMaxFocusHours(); for (User user : usersToNotify) { - try { - fcmService.sendMaxFocusNotification(user, maxFocusHours); - } catch (Exception e) { - log.error("Failed to send FCM max focus notification for user {}", user.getId(), e); - } + fcmService.sendMaxFocusNotification(user, maxFocusHours); } } catch (Exception e) { log.error("[MAX_FOCUS_SCHEDULER] Failed to check max focus sessions", e); diff --git a/src/main/java/com/gpt/geumpumtabackend/study/service/StudySessionService.java b/src/main/java/com/gpt/geumpumtabackend/study/service/StudySessionService.java index 0f022ce..486ae42 100644 --- a/src/main/java/com/gpt/geumpumtabackend/study/service/StudySessionService.java +++ b/src/main/java/com/gpt/geumpumtabackend/study/service/StudySessionService.java @@ -99,7 +99,7 @@ public StudySession makeStudySession(Long userId){ @Transactional public List endExpiredMaxFocusSessions() { int maxFocusHours = studyProperties.getMaxFocusHours(); - LocalDateTime cutoffTime = LocalDateTime.now().minusMinutes(maxFocusHours); + LocalDateTime cutoffTime = LocalDateTime.now().minusHours(maxFocusHours); List expiredSessions = studySessionRepository.findAllByStatusAndStartTimeBefore( StudyStatus.STARTED, cutoffTime From 8d4515def8bfe4c32dfc5631e17082c753196b1b Mon Sep 17 00:00:00 2001 From: Juhye0k Date: Fri, 3 Apr 2026 16:45:21 +0900 Subject: [PATCH 2/3] =?UTF-8?q?refactor=20:=20claude=20skill,=20md=20?= =?UTF-8?q?=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .ai/ARCHITECTURE.md | 250 ---- .ai/TESTING.md | 249 ---- .ai/USE-CASES.md | 349 ----- .claude/agents/api-documenter.md | 276 ++++ .claude/agents/backend-architect.md | 
51 + .claude/agents/code-reviewer.md | 286 ++++ .claude/agents/test-engineer.md | 935 ++++++++++++ .claude/skills/incident-response/SKILL.md | 254 ++++ .../references/grafana-queries.md | 142 ++ .../references/phase-guide.md | 122 ++ .../incident-response/references/templates.md | 184 +++ .claude/skills/skill-creator/SKILL.md | 485 ++++++ .../skills/skill-creator/agents/analyzer.md | 274 ++++ .../skills/skill-creator/agents/comparator.md | 202 +++ .claude/skills/skill-creator/agents/grader.md | 223 +++ .../skill-creator/assets/eval_review.html | 146 ++ .../eval-viewer/generate_review.py | 471 ++++++ .../skill-creator/eval-viewer/viewer.html | 1325 +++++++++++++++++ .../skill-creator/references/schemas.md | 430 ++++++ .../skills/skill-creator/scripts/__init__.py | 0 .../scripts/aggregate_benchmark.py | 401 +++++ .../skill-creator/scripts/generate_report.py | 326 ++++ .../scripts/improve_description.py | 247 +++ .../skill-creator/scripts/package_skill.py | 136 ++ .../skill-creator/scripts/quick_validate.py | 103 ++ .../skills/skill-creator/scripts/run_eval.py | 310 ++++ .../skills/skill-creator/scripts/run_loop.py | 328 ++++ .claude/skills/skill-creator/scripts/utils.py | 47 + .claude/skills/spring-core/SKILL.md | 105 ++ .../spring-core/references/exceptions.md | 65 + .../spring-core/references/templates.md | 230 +++ .claude/skills/spring-review/SKILL.md | 175 +++ .claude/skills/spring-test/SKILL.md | 368 +++++ .claude/skills/spring-test/evals/evals.json | 23 + .mcp.json | 18 + CLAUDE.md | 328 +--- .../com/gpt/geumpumtabackend/fcm/CLAUDE.md | 116 ++ .../fcm/exception/PermanentFcmException.java | 9 + .../com/gpt/geumpumtabackend/rank/CLAUDE.md | 225 +-- .../SeasonDepartmentRankingResponse.java | 2 - 40 files changed, 9001 insertions(+), 1215 deletions(-) delete mode 100644 .ai/ARCHITECTURE.md delete mode 100644 .ai/TESTING.md delete mode 100644 .ai/USE-CASES.md create mode 100644 .claude/agents/api-documenter.md create mode 100644 .claude/agents/backend-architect.md create mode 100644 .claude/agents/code-reviewer.md create mode 100644 .claude/agents/test-engineer.md create mode 100644 .claude/skills/incident-response/SKILL.md create mode 100644 .claude/skills/incident-response/references/grafana-queries.md create mode 100644 .claude/skills/incident-response/references/phase-guide.md create mode 100644 .claude/skills/incident-response/references/templates.md create mode 100644 .claude/skills/skill-creator/SKILL.md create mode 100644 .claude/skills/skill-creator/agents/analyzer.md create mode 100644 .claude/skills/skill-creator/agents/comparator.md create mode 100644 .claude/skills/skill-creator/agents/grader.md create mode 100644 .claude/skills/skill-creator/assets/eval_review.html create mode 100644 .claude/skills/skill-creator/eval-viewer/generate_review.py create mode 100644 .claude/skills/skill-creator/eval-viewer/viewer.html create mode 100644 .claude/skills/skill-creator/references/schemas.md create mode 100644 .claude/skills/skill-creator/scripts/__init__.py create mode 100644 .claude/skills/skill-creator/scripts/aggregate_benchmark.py create mode 100644 .claude/skills/skill-creator/scripts/generate_report.py create mode 100644 .claude/skills/skill-creator/scripts/improve_description.py create mode 100644 .claude/skills/skill-creator/scripts/package_skill.py create mode 100644 .claude/skills/skill-creator/scripts/quick_validate.py create mode 100644 .claude/skills/skill-creator/scripts/run_eval.py create mode 100644 .claude/skills/skill-creator/scripts/run_loop.py create mode 100644 
.claude/skills/skill-creator/scripts/utils.py create mode 100644 .claude/skills/spring-core/SKILL.md create mode 100644 .claude/skills/spring-core/references/exceptions.md create mode 100644 .claude/skills/spring-core/references/templates.md create mode 100644 .claude/skills/spring-review/SKILL.md create mode 100644 .claude/skills/spring-test/SKILL.md create mode 100644 .claude/skills/spring-test/evals/evals.json create mode 100644 .mcp.json create mode 100644 src/main/java/com/gpt/geumpumtabackend/fcm/CLAUDE.md create mode 100644 src/main/java/com/gpt/geumpumtabackend/fcm/exception/PermanentFcmException.java diff --git a/.ai/ARCHITECTURE.md b/.ai/ARCHITECTURE.md deleted file mode 100644 index 031ca9f..0000000 --- a/.ai/ARCHITECTURE.md +++ /dev/null @@ -1,250 +0,0 @@ -# ARCHITECTURE.md - -Geumpumta 백엔드 시스템 아키텍처 문서. - ---- - -## 1. 시스템 개요 - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ 클라이언트 (모바일 앱) │ -└────────────────────────────┬────────────────────────────────────┘ - │ HTTPS - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Security Filter Chain │ -│ CORS → OAuth2Login → JwtAuthenticationFilter → @PreAuthorize │ -├─────────────────────────────────────────────────────────────────┤ -│ Controller Layer (@AssignUserId AOP → userId 자동 주입) │ -├─────────────────────────────────────────────────────────────────┤ -│ Service Layer │ -│ study │ rank │ statistics │ user │ token │ board │ fcm │ wifi │ -├─────────────────────────────────────────────────────────────────┤ -│ Repository Layer (JPA │ Native Query │ JDBC Batch │ Redis) │ -├─────────────────────────────────────────────────────────────────┤ -│ Scheduler Layer │ -│ RankingScheduler │ SeasonTransition │ MaxFocus │ TokenCleanup │ -└────────┬──────────┬──────────┬──────────┬───────────────────────┘ - │ │ │ │ - ┌─────▼───┐ ┌───▼────┐ ┌──▼───┐ ┌───▼──────┐ - │ MySQL 8 │ │ Redis │ │ FCM │ │Cloudinary│ - └─────────┘ └────────┘ └──────┘ └──────────┘ -``` - ---- - -## 2. 엔티티 관계도 - -``` - ┌─────────────┐ - │ User │ - │ role │ GUEST → USER → ADMIN - │ department │ Enum (25개 학과) - │ provider │ KAKAO, GOOGLE, APPLE - │ fcmToken │ - └──────┬──────┘ - │ - ┌──────────────┼──────────────┐ - │ 1:N (FK) │ 1:N (FK) │ 1:N (FK 없음) - ▼ ▼ ▼ - ┌─────────────┐ ┌───────────┐ ┌─────────────┐ - │StudySession │ │UserRanking│ │RefreshToken │ - │ startTime │ │ rank │ │ userId │ - │ endTime │ │ totalMillis│ │ refreshToken│ - │ totalMillis │ │ rankingType│ │ expiredAt │ - │ status │ │calculatedAt│ └─────────────┘ - └─────────────┘ └───────────┘ - -┌──────────────────┐ ┌───────────────────────┐ ┌────────┐ -│DepartmentRanking │ │SeasonRankingSnapshot │ │ Season │ -│ department (Enum)│ │ seasonId (FK없음) │ │ type │ -│ rank, totalMillis│ │ userId (FK없음) │ │ status │ -│ rankingType │ │ rankType, finalRank │ │ start │ -│ calculatedAt │ │ department (nullable) │ │ end │ -└──────────────────┘ └───────────────────────┘ └────────┘ -``` - -**설계 결정:** -- `SeasonRankingSnapshot`에 FK 없음 → 시즌/유저 삭제 후에도 이력 보존 -- `RefreshToken`에 FK 없음 → 유저 soft-delete와 독립적으로 토큰 정리 -- `User` soft-delete 시 `deleted_` prefix → unique 제약 유지하면서 재가입 허용 - ---- - -## 3. 인증 플로우 - -``` -[OAuth2 로그인] -앱 → /oauth2/authorization/{provider}?redirect_uri=... - → CustomAuthorizationRequestResolver (redirect_uri를 state에 인코딩) - → OAuth2 Provider 인증 - → CustomOAuth2UserService.loadUser() → User 조회/생성 (role=GUEST) - → SuccessHandler → JWT 발급 → redirect_uri?accessToken=...&refreshToken=... 
- -[회원가입 완료] -POST /email/request-code → Redis에 인증코드 (TTL 5분) -POST /email/verify-code → 코드 검증 -POST /user/complete-registration → GUEST→USER 승격, 새 JWT 발급 - -[API 요청] -Authorization: Bearer {token} - → JwtAuthenticationFilter → parseToken (JJWT, HMAC-SHA256) - → withdrawn=true이면 /restore 외 차단 - → @PreAuthorize → @AssignUserId AOP → Controller -``` - ---- - -## 4. 랭킹 시스템 - -### 이중 랭킹 구조 - -``` -date 파라미터 유무로 분기: - -date 없음 (현재 기간) date 있음 (과거 기간) - │ │ - ▼ ▼ -실시간 랭킹 확정 랭킹 -StudySession Native Query로 UserRanking / DepartmentRanking -직접 계산 (진행중 세션 포함) 테이블에서 조회 (스케줄러가 저장) -``` - -### 시즌 랭킹 계산 - -``` -현재 시즌 랭킹 = ① + ② + ③ 합산 후 순위 부여 - -① 확정 월간 합산 (시즌 시작 ~ 전월 말) - → UserRankingRepository JPQL -② 현재 월 일간 합산 (이번 달 1일 ~ 어제) - → UserRankingRepository JPQL -③ 오늘 실시간 데이터 - → StudySessionRepository Native Query - -종료된 시즌 → SeasonRankingSnapshot 불변 스냅샷 조회 (계산 없음) -``` - -### 시즌 전환 (매일 00:05) - -``` -SeasonTransitionScheduler - → 캐시 우회 DB 조회 → today ≥ endDate+1 ? - → Yes: activeSeason 캐시 clear - → transitionToNextSeason (현재=ENDED, 다음=ACTIVE) - → createSeasonSnapshot (@Retryable 3회, JDBC 배치 2000건) - → No: return -``` - -### 학과 랭킹 - -학과별 상위 30명의 공부 시간 합산. Native Query + CTE로 25개 학과 처리. -`ROW_NUMBER() PARTITION BY department` → 상위 30 필터 → `SUM GROUP BY` → `RANK()`. - ---- - -## 5. 학습 세션 흐름 - -``` -[시작] POST /study/start {gatewayIp, clientIp} - → WiFi 검증 (@Cacheable) → 중복 STARTED 확인 → 세션 생성 (서버 시간) - -[종료] POST /study/end {studySessionId} - → 세션 조회 → endTime=서버시간, totalMillis=Duration 계산 → FINISHED - -[자동종료] 매 10분 스케줄러 - → STARTED + 3시간 초과 세션 → 자동 종료 + FCM 알림 -``` - ---- - -## 6. 크로스 도메인 의존성 - -### 서비스 의존 그래프 - -``` -StudySessionService ──→ CampusWiFiValidationService, FcmService -PersonalRankService ──→ StudySessionRepository, UserRankingRepository -DepartmentRankService → StudySessionRepository, DepartmentRankingRepository -SeasonRankService ────→ SeasonService(@Cacheable), UserRankingRepo, StudySessionRepo -SeasonSnapshotService → UserRankingRepo, SeasonSnapshotBatchService(JDBC) -StatisticsService ────→ StudySessionRepository (12개 CTE) -UserService ──────────→ JwtHandler, RefreshTokenRepo, FcmService -TokenService ─────────→ JwtHandler, RefreshTokenRepo -``` - -### StudySessionRepository — 쿼리 허브 - -3개 도메인(study, rank, statistics)이 공유. 수정 시 전체 영향. - -| 쿼리 | 도메인 | 용도 | -|------|--------|------| -| `calculateCurrentPeriodRanking` | rank | 실시간 개인 랭킹 | -| `calculateCurrentDepartmentRanking` | rank | 실시간 학과 랭킹 | -| `calculateFinalizedPeriodRanking` | rank | 확정 개인 랭킹 배치 | -| `calculateFinalizedDepartmentRanking` | rank | 확정 학과 랭킹 배치 | -| `getTwoHourSlotStats` | statistics | 일간 2시간 슬롯 | -| `getWeeklyStatistics` | statistics | 주간 통계 | -| `getMonthlyStatistics` | statistics | 월간 통계 | -| `getGrassStatistics` | statistics | 잔디 차트 (NTILE) | -| `sumCompletedStudySessionByUserId` | study | 오늘 총 공부 시간 | - ---- - -## 7. 캐싱 전략 - -| 캐시 | 저장소 | 키 | TTL | 무효화 | -|------|--------|-----|-----|--------| -| `wifiValidation` | Caffeine | `gatewayIp:clientIp` | 10분 | 자동 만료 | -| `activeSeason` | Caffeine | 단일 엔트리 | 10분 | 시즌 전환 시 수동 clear | -| 이메일 인증코드 | Redis | `{userId}email:{email}` | 5분 | 자동 만료 | - ---- - -## 8. 스케줄러 타임라인 - -``` -매일: -00:00:00 RefreshTokenDelete 만료 토큰 삭제 -00:00:05 DailyRanking 전일 개인/학과 랭킹 확정 -00:05:00 SeasonTransition 시즌 종료 확인 → 전환/스냅샷 - ★ MonthlyRanking(00:02) 이후 실행 (데이터 의존) -월요일: 00:01 WeeklyRanking -1일: 00:02 MonthlyRanking -매 10분: MaxFocusStudy 3시간 초과 세션 자동 종료 + FCM -``` - ---- - -## 9. 
예외 처리 경로 - -``` -경로 1: Service 예외 - throw BusinessException(ExceptionType) → GlobalExceptionHandler - → {"success":false, "code":"ST002", "msg":"..."} - -경로 2: 인증 예외 - JwtAuthenticationFilter catch → HttpServletResponse 직접 JSON 작성 - → {"success":false, "code":"S004", "msg":"..."} - -경로 3: Validation 예외 - @Valid MethodArgumentNotValidException → GlobalExceptionHandler - → {"success":false, "code":"C002", "msg":"커스텀 메시지"} -``` - -``` -예외 계층: -RuntimeException - ├── BusinessException (ExceptionType: code + message + HttpStatus) - └── JwtAuthenticationException - ├── JwtTokenExpiredException (S004, 401) - ├── JwtTokenInvalidException (S005, 401) - ├── JwtNotExistException (S006, 401) - └── JwtAccessDeniedException (S003, 403) - -응답 구조: -ResponseBody (sealed) - ├── SuccessResponseBody → {"success":true, "data":{...}} - └── FailedResponseBody → {"success":false, "code":"...", "msg":"..."} -``` diff --git a/.ai/TESTING.md b/.ai/TESTING.md deleted file mode 100644 index 11a82ca..0000000 --- a/.ai/TESTING.md +++ /dev/null @@ -1,249 +0,0 @@ -# TESTING.md - -이 프로젝트의 테스트 전략, 작성 규칙, 커버리지 기준을 정의한다. - ---- - -## 테스트 원칙 - -1. **핵심 비즈니스 로직은 반드시 단위 테스트한다** — 시간 계산, 랭킹 병합, 인증, 상태 전이 -2. **외부 경계는 통합 테스트한다** — HTTP 요청/응답, 인증 필터, Native Query 정합성 -3. **단순 위임/CRUD는 테스트하지 않는다** — `repository.save()` 호출만 하는 메서드, getter/setter -4. **외부 서비스는 Mock한다** — Cloudinary, FCM, SMTP를 실제 호출하지 않음 -5. **테스트가 실패하면 배포하지 않는다** — CI에서 전체 테스트 통과 필수 - ---- - -## 커버리지 기준 - -| 계층 | 목표 | 기준 | -|------|------|------| -| Service (핵심 로직) | **85%** | 모든 public 메서드 + 분기 커버 | -| Domain Entity (상태 전이) | **90%** | 도메인 메서드 전체 | -| Controller (통합) | **70%** | 정상 + 인증실패 + 주요 예외 | -| Scheduler (로직) | **80%** | 계산 로직 + 예외 처리 | -| Repository (복잡 쿼리) | **60%** | Native Query, CTE 정합성 | - -### 도메인별 우선순위 - -| 순위 | 도메인 | 유형 | 이유 | -|------|--------|------|------| -| **P0** | study, rank, token, user | Unit + Integration | 핵심 기능, 계산 정확성, 보안 | -| **P1** | scheduler, email, statistics | Unit | 데이터 무결성, 인증 | -| **P2** | wifi, image | Unit | 이미 완성 or 외부 서비스 래핑 | -| **P3** | board | 선택 | 단순 CRUD | - ---- - -## 테스트 구조 - -``` -src/test/java/com/gpt/geumpumtabackend/ -├── unit/ # Mockito + H2 -│ ├── config/ -│ │ └── BaseUnitTest.java # 모든 단위 테스트 상속 -│ └── {domain}/service/ -│ └── {Domain}ServiceTest.java -└── integration/ # TestContainers (MySQL 8.0 + Redis 7.0) - ├── config/ - │ └── BaseIntegrationTest.java # TRUNCATE + FLUSHALL 격리 - └── {domain}/controller/ - └── {Domain}ControllerIntegrationTest.java -``` - ---- - -## 단위 테스트 - -### 작성 대상 - -| 테스트한다 | 테스트하지 않는다 | -|-----------|-----------------| -| 조건 분기가 있는 비즈니스 로직 | 단순 getter/setter | -| 계산 로직 (시간, 랭킹 합산) | `repository.save()` 호출만 하는 메서드 | -| 상태 전이 (STARTED→FINISHED) | `@ConfigurationProperties` 바인딩 | -| 예외 발생 조건 | 외부 SDK 래핑 (Cloudinary upload) | -| 데이터 병합/변환, fallback | 단순 위임 메서드 | - -### 작성 패턴 - -```java -class ExampleServiceTest extends BaseUnitTest { - - @InjectMocks private ExampleService exampleService; - @Mock private ExampleRepository exampleRepository; - - @Nested - @DisplayName("기능 그룹") - class MethodGroup { - - @Test - @DisplayName("정상 — 설명") - void shouldReturnResult_whenValidInput() { - // given - given(exampleRepository.findById(1L)).willReturn(Optional.of(entity)); - // when - var result = exampleService.getExample(1L); - // then - assertThat(result).isNotNull(); - assertThat(result.name()).isEqualTo("expected"); - } - - @Test - @DisplayName("예외 — 설명") - void shouldThrow_whenNotFound() { - // given - given(exampleRepository.findById(999L)).willReturn(Optional.empty()); - // when & then - 
assertThatThrownBy(() -> exampleService.getExample(999L)) - .isInstanceOf(BusinessException.class) - .extracting(e -> ((BusinessException) e).getExceptionType()) - .isEqualTo(ExceptionType.EXAMPLE_NOT_FOUND); - } - } -} -``` - -### 네이밍 - -- 클래스: `{대상}Test` — `StudySessionServiceTest` -- 메서드: `should{결과}_when{조건}` — `shouldThrow_whenSessionAlreadyExists` -- `@DisplayName`: 한글 — `"예외 — 이미 진행중인 세션이 있을 때"` -- `@Nested`: 기능 그룹 — `class 학습시작`, `class 학습종료` - ---- - -## 통합 테스트 - -### 작성 대상 - -| 테스트한다 | 테스트하지 않는다 | -|-----------|-----------------| -| Controller 인증/응답 형식 | 단순 Service 로직 (단위로 충분) | -| Native Query / CTE 정합성 | 외부 API 호출 | -| 인증 필터 체인 (JWT→PreAuthorize→AssignUserId) | | - -### 작성 패턴 - -```java -class ExampleControllerIntegrationTest extends BaseIntegrationTest { - - @Autowired private UserRepository userRepository; - @Autowired private JwtHandler jwtHandler; - private String accessToken; - - @BeforeEach - void setUp() { - var user = userRepository.save(User.builder() - .email("test@kumoh.ac.kr").role(UserRole.USER) - .nickname("tester").provider(OAuth2Provider.KAKAO) - .providerId("id").department(Department.COMPUTER_ENGINEERING).build()); - accessToken = jwtHandler.createTokens( - new JwtUserClaim(user.getId(), UserRole.USER, false)).getAccessToken(); - } - - @Test - @DisplayName("200 — 정상 조회") - void shouldReturn200() throws Exception { - mockMvc.perform(get("/api/v1/example") - .header("Authorization", "Bearer " + accessToken)) - .andExpect(status().isOk()) - .andExpect(jsonPath("$.success").value(true)); - } - - @Test - @DisplayName("401 — 토큰 없음") - void shouldReturn401() throws Exception { - mockMvc.perform(get("/api/v1/example")) - .andExpect(status().isUnauthorized()); - } -} -``` - -### 엔드포인트별 최소 검증 - -| 케이스 | 검증 | -|--------|------| -| 정상 요청 | 200, `success: true`, 데이터 구조 | -| 인증 없음 | 401 | -| 주요 예외 | 4xx, `success: false`, 에러 코드 | -| ADMIN 전용 | USER로 접근 시 403 | - ---- - -## 도메인별 테스트 필요 항목 - -### P0 — 반드시 작성 - -| 서비스 | 테스트할 핵심 로직 | 유형 | -|--------|-------------------|------| -| `StudySessionService` | 세션 시작(중복 방지), 종료(시간 계산), 최대시간 자동종료 | Unit | -| `PersonalRankService` | 실시간/확정 분기, fallback, 동점 처리 | Unit | -| `DepartmentRankService` | Top30 합산, 0시간 필터링, 본인 학과 포함 | Unit | -| `SeasonRankService` | 3중 병합(월간+일간+실시간), 종료 시즌 스냅샷 | Unit | -| `SeasonService` | 4시즌 순환 전환, 날짜 검증, 윤년 | Unit | -| `TokenService` | 토큰 갱신(만료 토큰 파싱, 리프레시 매칭, 재발급) | Unit | -| `UserService` | GUEST→USER 승격, 탈퇴(prefix), 복구(prefix 제거) | Unit | -| `StudySessionController` | 시작/종료/조회 E2E, 인증 | Integration | -| `RankController` (전체) | 일간/주간/월간 랭킹 응답 구조, 인증 | Integration | -| `UserController` | 가입 완료, 프로필, 탈퇴/복구 | Integration | -| `TokenController` | 토큰 갱신 응답 | Integration | - -### P1 — 권장 - -| 서비스 | 테스트할 핵심 로직 | 유형 | -|--------|-------------------|------| -| `RankingSchedulerService` | 기간 계산(어제/전주/전월), 랭킹 저장 | Unit | -| `SeasonTransitionScheduler` | 종료일 판단, 캐시 clear, 전환+스냅샷 순서 | Unit | -| `SeasonSnapshotService` | 재시도(3회), 중복 방지, 배치 인서트 | Unit | -| `EmailService` | 인증코드 Redis 저장/검증/만료 | Unit | -| `StatisticsService` | 2시간 슬롯, 잔디 차트 데이터 가공 | Unit | - -### P2~P3 — 선택 - -| 서비스 | 비고 | -|--------|------| -| `CampusWiFiValidationService` | 이미 5개 테스트 완성, 추가 불필요 | -| `ImageService` | Mock 기반 — 파일 검증, 크기 초과, 업로드 실패 롤백 | -| `FcmService` | Mock 기반 — 토큰 없는 사용자 skip, 발송 실패 처리 | -| `BoardService` | 단순 CRUD, ADMIN 권한 통합 테스트만 고려 | - ---- - -## 테스트 환경 - -| 항목 | 단위 (unit-test) | 통합 (test) | -|------|------------------|-------------| -| DB | H2 인메모리 | TestContainers MySQL 8.0 | -| Redis | 비활성 | TestContainers Redis 7.0 | -| 외부 서비스 | 
`@Mock` | `@MockBean` | -| 베이스 클래스 | `BaseUnitTest` | `BaseIntegrationTest` | -| 테스트 격리 | Mockito 초기화 | TRUNCATE ALL + FLUSHALL | -| JVM 옵션 | `-XX:+EnableDynamicAgentLoading` | 동일 | - ---- - -## 실행 명령 - -```bash -./gradlew test # 전체 -./gradlew test --tests "com.gpt.geumpumtabackend.unit.*" # 단위만 -./gradlew test --tests "com.gpt.geumpumtabackend.integration.*" # 통합만 -./gradlew test --tests "StudySessionServiceTest" # 특정 클래스 -./gradlew test --tests "StudySessionServiceTest.shouldThrow*" # 특정 메서드 -``` - ---- - -## 체크리스트: 새 기능 추가 시 - -``` -□ Service에 단위 테스트 작성 - □ 정상 케이스 (최소 1개) - □ 예외 케이스 (BusinessException 조건 전부) - □ 경계값 (null, 빈 리스트, 0) -□ 엔티티에 상태 전이/계산 로직이 있으면 테스트 -□ Controller에 통합 테스트 작성 (200 + 401 + 주요 4xx) -□ Native Query 추가/수정 시 통합 테스트로 검증 -□ 기존 테스트 깨지지 않음 확인 (./gradlew test) -``` diff --git a/.ai/USE-CASES.md b/.ai/USE-CASES.md deleted file mode 100644 index 0a16f58..0000000 --- a/.ai/USE-CASES.md +++ /dev/null @@ -1,349 +0,0 @@ -# USE-CASES.md - -현재 구현된 기능을 유즈케이스 단위로 정리한 문서. 총 **42개** 유즈케이스. - ---- - -## 액터 정의 - -| 액터 | 설명 | -|------|------| -| GUEST | OAuth2 로그인 완료, 회원가입 미완료 (이메일 인증·학과 선택 전) | -| USER | 회원가입 완료 사용자 | -| ADMIN | 관리자 (USER 권한 포함) | -| SYSTEM | 스케줄러·내부 서비스 호출 | - ---- - -## 1. 학습 세션 (Study) - -### UC-ST-001 오늘의 학습 현황 조회 -| 항목 | 내용 | -|------|------| -| 액터 | USER | -| 엔드포인트 | `GET /api/v1/study` | -| 설명 | 오늘 완료된 세션의 총 공부 시간 + 현재 진행 중 여부 반환 | -| 비즈니스 규칙 | FINISHED 세션만 합산, STARTED 세션은 isStudying 플래그로 표시 | - -### UC-ST-002 학습 세션 시작 -| 항목 | 내용 | -|------|------| -| 액터 | USER | -| 엔드포인트 | `POST /api/v1/study/start` | -| 전제조건 | 캠퍼스 Wi-Fi 접속 상태, 진행 중인 세션 없음 | -| 흐름 | Wi-Fi 검증 → 중복 세션 확인 → 세션 생성 (startTime=서버 시간, status=STARTED) | -| 비즈니스 규칙 | 클라이언트 타임스탬프 사용 금지, 1인 1세션 제한 | -| 에러코드 | `W001` `W002` `W003` `ST002` `U001` | - -### UC-ST-003 학습 세션 종료 -| 항목 | 내용 | -|------|------| -| 액터 | USER | -| 엔드포인트 | `POST /api/v1/study/end` | -| 흐름 | 세션 조회 → endTime=서버 시간 → totalMillis 계산 → status=FINISHED | -| 에러코드 | `ST001` `U001` | - -### UC-ST-004 최대 집중시간 초과 자동 종료 -| 항목 | 내용 | -|------|------| -| 액터 | SYSTEM | -| 트리거 | 매 10분 (`0 */10 * * * *`) | -| 흐름 | STARTED + 3시간 초과 세션 검색 → endTime=startTime+3h → FINISHED → FCM 알림 | -| 비즈니스 규칙 | FCM 실패해도 세션 종료는 진행, endTime은 현재 시간이 아닌 startTime+maxHours | - ---- - -## 2. 개인 랭킹 (Personal Rank) - -> **이중 랭킹 구조**: `date` 파라미터 없으면 실시간 계산 (Native Query), 있으면 확정 랭킹 조회 (UserRanking 테이블) - -### UC-RK-001~006 개인 랭킹 조회 (일간/주간/월간 × 실시간/확정) - -| UC | 엔드포인트 | 유형 | 기간 기준 | -|----|-----------|------|----------| -| 001 | `GET /personal/daily` | 실시간 | 오늘 00:00~23:59 | -| 002 | `GET /personal/daily?date=` | 확정 | 지정 날짜 | -| 003 | `GET /personal/weekly` | 실시간 | 이번 주 월~일 | -| 004 | `GET /personal/weekly?date=` | 확정 | 지정 주 (월요일 기준) | -| 005 | `GET /personal/monthly` | 실시간 | 이번 달 1일~말일 | -| 006 | `GET /personal/monthly?date=` | 확정 | 지정 월 (1일 기준) | - -**공통 규칙:** -- 실시간: 진행 중 세션 포함 (startTime~now), LEAST/GREATEST로 기간 경계 처리 -- 확정: 스케줄러가 저장한 UserRanking에서 조회 -- 응답: 상위 랭킹 목록 + 본인 순위 (없으면 rank=listSize+1, totalMillis=0) - ---- - -## 3. 학과 랭킹 (Department Rank) - -### UC-RK-007~009 학과 랭킹 조회 (일간/주간/월간 × 실시간/확정) - -| UC | 엔드포인트 | 유형 | -|----|-----------|------| -| 007 | `GET /department/daily` | 실시간 | -| 008 | `GET /department/daily?date=` | 확정 | -| 009 | `GET /department/{weekly,monthly}` | 실시간/확정 | - -**학과 랭킹 계산 규칙:** -- 학과별 상위 30명의 공부 시간 합산 -- Native Query + CTE, `ROW_NUMBER() PARTITION BY department` → 상위 30 필터 → SUM → RANK() -- 25개 학과 대상, 0시간 학과는 topRanks에서 제외 (본인 학과는 항상 포함) - ---- - -## 4. 
시즌 랭킹 (Season Rank) - -### UC-RK-010 현재 시즌 전체 랭킹 -| 항목 | 내용 | -|------|------| -| 액터 | USER | -| 엔드포인트 | `GET /api/v1/rank/season/current` | -| 흐름 | ①확정 월간 합산 + ②이번 달 일간 합산 + ③오늘 실시간 → 유저별 merge → 순위 부여 | -| 에러코드 | `SE001` `U001` | - -### UC-RK-011 현재 시즌 학과별 랭킹 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/rank/season/current/department?department=` | -| 흐름 | UC-RK-010과 동일하나 학과 필터 적용 | - -### UC-RK-012 종료 시즌 전체 랭킹 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/rank/season/{seasonId}` | -| 전제조건 | 시즌 status=ENDED | -| 흐름 | SeasonRankingSnapshot (rankType=OVERALL) 조회 | -| 에러코드 | `SE002` `SE003` | - -### UC-RK-013 종료 시즌 학과별 랭킹 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/rank/season/{seasonId}/department?department=` | -| 흐름 | SeasonRankingSnapshot (rankType=DEPARTMENT) + 학과 필터 | - ---- - -## 5. 랭킹 스케줄러 (Rank Scheduler) - -### UC-RK-014 일간 랭킹 확정 -| 항목 | 내용 | -|------|------| -| 트리거 | 매일 00:00:05 (`5 0 0 * * *`) | -| 흐름 | 전일 StudySession 계산 → UserRanking (DAILY) + DepartmentRanking (DAILY) 저장 | - -### UC-RK-015 주간 랭킹 확정 -| 항목 | 내용 | -|------|------| -| 트리거 | 매주 월요일 00:01 (`0 1 0 ? * MON`) | -| 흐름 | 전주 데이터 → UserRanking (WEEKLY) + DepartmentRanking (WEEKLY) 저장 | - -### UC-RK-016 월간 랭킹 확정 -| 항목 | 내용 | -|------|------| -| 트리거 | 매월 1일 00:02 (`0 2 0 1 * ?`) | -| 흐름 | 전월 데이터 → UserRanking (MONTHLY) + DepartmentRanking (MONTHLY) 저장 | - -### UC-RK-017 시즌 전환 및 스냅샷 생성 -| 항목 | 내용 | -|------|------| -| 트리거 | 매일 00:05 (`0 5 0 * * *`) | -| 전제조건 | today ≥ activeSeason.endDate + 1 | -| 흐름 | 캐시 클리어 → 현재 시즌 ENDED → 다음 시즌 ACTIVE → SeasonRankingSnapshot 배치 생성 | -| 비즈니스 규칙 | @Retryable 3회 (5초 backoff), JDBC 배치 2000건 청크, 중복 방지 체크 | -| 에러코드 | `SE001` `SE002` | - ---- - -## 6. 통계 (Statistics) - -> 모든 통계는 본인 또는 타 유저 조회 가능 (`targetUserId` 파라미터) - -### UC-STAT-001 일간 통계 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/statistics/day?date=&targetUserId=` | -| 응답 | 2시간 슬롯 12개 (00~02, 02~04, ...) + 최대 집중 시간 + 총 공부 시간 | - -### UC-STAT-002 주간 통계 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/statistics/week?date=&targetUserId=` | -| 응답 | 요일별 공부 시간 + 최대 집중 시간 | - -### UC-STAT-003 월간 통계 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/statistics/month?date=&targetUserId=` | -| 응답 | 일별 공부 시간 집계 | - -### UC-STAT-004 잔디 차트 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/statistics/grass?date=&targetUserId=` | -| 응답 | 5개월 범위 (전 3개월~다음 1개월) 일별 공부 기록 | - ---- - -## 7. 
사용자 (User) - -### UC-US-001 회원가입 완료 -| 항목 | 내용 | -|------|------| -| 액터 | GUEST | -| 엔드포인트 | `POST /api/v1/user/complete-registration` | -| 전제조건 | 이메일 인증 완료 (UC-US-006) | -| 흐름 | schoolEmail·studentId·department 저장 → 랜덤 닉네임 생성 → GUEST→USER 승격 → 새 JWT 발급 | -| 비즈니스 규칙 | 닉네임 = {형용사}{명사}{1~100}, 중복 시 재생성 | - -### UC-US-002 프로필 조회 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/user/profile` | -| 응답 | nickname, email, department, picture 등 | - -### UC-US-003 닉네임 중복 확인 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `GET /api/v1/user/nickname/verify?nickname=` | -| 응답 | 사용 가능 여부 (boolean) | - -### UC-US-004 프로필 수정 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `POST /api/v1/user/profile` | -| 흐름 | imageUrl·publicId·nickname 업데이트 | - -### UC-US-005 이메일 인증코드 요청 -| 항목 | 내용 | -|------|------| -| 액터 | GUEST | -| 엔드포인트 | `POST /api/v1/email/request-code` | -| 흐름 | 6자리 랜덤 코드 생성 → Redis 저장 (TTL 5분) → 이메일 발송 | -| 비즈니스 규칙 | @kumoh.ac.kr 이메일만 허용, Redis 키: `{userId}email:{email}` | -| 에러코드 | `M001` | - -### UC-US-006 이메일 인증코드 검증 -| 항목 | 내용 | -|------|------| -| 액터 | GUEST | -| 엔드포인트 | `POST /api/v1/email/verify-code` | -| 흐름 | Redis에서 코드 조회 → 일치 시 삭제 (일회용) → 성공/실패 boolean 반환 | - -### UC-US-007 로그아웃 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `DELETE /api/v1/user/logout` | -| 흐름 | RefreshToken 전체 삭제 + FCM 토큰 제거 | - -### UC-US-008 회원 탈퇴 (Soft Delete) -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `DELETE /api/v1/user/withdraw` | -| 흐름 | RefreshToken 삭제 + FCM 제거 + @SQLDelete 마스킹 (필드 앞에 `deleted_` 접두사) | -| 비즈니스 규칙 | 데이터 보존, unique 제약 유지하면서 재가입 허용 | - -### UC-US-009 탈퇴 복구 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `POST /api/v1/user/restore` | -| 흐름 | `deleted_` 접두사 제거 → deletedAt 초기화 → 새 JWT 발급 | - ---- - -## 8. 토큰 (Token) - -### UC-TK-001 토큰 갱신 -| 항목 | 내용 | -|------|------| -| 액터 | 인증 불필요 | -| 엔드포인트 | `POST /auth/token/refresh` | -| 흐름 | accessToken 디코딩 → refreshToken DB 매칭 → 기존 삭제 → 새 토큰 쌍 발급 | -| 에러코드 | `S005` `T001` `T002` | - ---- - -## 9. 게시판 (Board) - -### UC-BD-001~004 - -| UC | 엔드포인트 | 액터 | 설명 | -|----|-----------|------|------| -| 001 | `GET /board/list` | USER | 최근 10건 목록 조회 | -| 002 | `GET /board/{id}` | USER | 상세 조회 | -| 003 | `POST /board` | ADMIN | 공지 작성 | -| 004 | `DELETE /board/{id}` | ADMIN | 공지 삭제 (soft delete) | - ---- - -## 10. 이미지 (Image) - -### UC-IM-001 프로필 이미지 업로드 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `POST /api/v1/image/profile` | -| 흐름 | 파일 검증 (빈 파일/크기/타입) → Cloudinary 업로드 → URL 반환 | -| 비즈니스 규칙 | 최대 10MB, JPEG·PNG·WebP·GIF만 허용, 실패 시 업로드 롤백 시도 | -| 에러코드 | `I001` `I002` `I003` | - ---- - -## 11. FCM (Firebase Cloud Messaging) - -### UC-FC-001 디바이스 토큰 등록 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `POST /api/v1/fcm/register` | -| 흐름 | fcmToken을 User 엔티티에 저장 (1인 1토큰, 덮어쓰기) | -| 에러코드 | `F001` | - -### UC-FC-002 디바이스 토큰 삭제 -| 항목 | 내용 | -|------|------| -| 엔드포인트 | `DELETE /api/v1/fcm/token` | -| 흐름 | User.fcmToken = null | - -### UC-FC-003 최대 집중시간 알림 발송 -| 항목 | 내용 | -|------|------| -| 액터 | SYSTEM (UC-ST-004에서 호출) | -| 흐름 | fcmToken 존재 시 푸시 발송, 실패해도 예외 미전파 | - ---- - -## 12. 
인증 (OAuth2 / Auth) - -### UC-AU-001 OAuth2 소셜 로그인 -| 항목 | 내용 | -|------|------| -| 액터 | 미인증 사용자 | -| 진입점 | `/oauth2/authorization/{kakao,google,apple}` | -| 흐름 | Provider 인증 → User 조회/생성 (GUEST) → JWT 발급 → redirect_uri로 토큰 전달 | -| 비즈니스 규칙 | 최초 로그인 시 GUEST 생성, 탈퇴 유저는 withdrawn 표시, redirect_uri 화이트리스트 검증 | - -### UC-AU-002 만료 리프레시 토큰 정리 -| 항목 | 내용 | -|------|------| -| 트리거 | 매일 00:00 (`0 0 0 * * *`) | -| 흐름 | expiredAt < now인 RefreshToken 일괄 삭제 | - ---- - -## 유즈케이스 요약 - -| 도메인 | 수 | API | 스케줄러 | 내부 호출 | -|--------|-----|-----|---------|----------| -| Study | 4 | 3 | 1 | — | -| Personal Rank | 6 | 6 | — | — | -| Department Rank | 3 | 6 | — | — | -| Season Rank | 4 | 4 | — | — | -| Rank Scheduler | 4 | — | 4 | — | -| Statistics | 4 | 4 | — | — | -| User | 9 | 9 | — | — | -| Token | 1 | 1 | — | — | -| Board | 4 | 4 | — | — | -| Image | 1 | 1 | — | — | -| FCM | 3 | 2 | — | 1 | -| Auth | 2 | 1 | 1 | — | -| WiFi | 1 | — | — | 1 | -| **합계** | **46** | **41** | **6** | **2** | diff --git a/.claude/agents/api-documenter.md b/.claude/agents/api-documenter.md new file mode 100644 index 0000000..f32254f --- /dev/null +++ b/.claude/agents/api-documenter.md @@ -0,0 +1,276 @@ +--- +name: api-documenter +description: "Use this agent when creating or improving API documentation, writing OpenAPI specifications, building interactive documentation portals, or generating code examples for APIs. Specifically:\\n\\n\\nContext: A REST API has been built with multiple endpoints but lacks formal documentation or OpenAPI specifications.\\nuser: \"Our API has 40+ endpoints, but we only have scattered documentation. Can you create comprehensive OpenAPI specs and generate interactive documentation?\"\\nassistant: \"I'll analyze your API endpoints, create a complete OpenAPI 3.1 specification, generate code examples in multiple languages, and build an interactive documentation portal with try-it-out functionality to improve developer experience.\"\\n\\nUse this agent when you need to create formal, comprehensive API documentation from scratch. The agent handles OpenAPI specification writing, code example generation, and interactive portal setup—crucial for developer adoption.\\n\\n\\n\\n\\nContext: An existing GraphQL API lacks proper documentation and developers struggle with authentication and complex queries.\\nuser: \"Our GraphQL schema is not documented. Developers can't figure out how to authenticate or write queries. We need better integration guides.\"\\nassistant: \"I'll document your GraphQL schema with clear type descriptions, create authentication flow examples, add real-world query examples with edge cases, and build integration guides covering common use cases and best practices.\"\\n\\nInvoke this agent when API documentation is missing or inadequate, causing integration friction. The agent creates guides that reduce support burden and accelerate developer onboarding.\\n\\n\\n\\n\\nContext: An API is being versioned and deprecated, requiring migration guides and clear communication about breaking changes.\\nuser: \"We're releasing v2 of our API with breaking changes. How do we document the migration path and deprecation timeline?\"\\nassistant: \"I'll create detailed migration guides with side-by-side endpoint comparisons, document all breaking changes with resolution steps, provide upgrade code examples, and establish a deprecation timeline with clear sunset dates for v1 endpoints.\"\\n\\nUse this agent when managing API lifecycle events like versioning or deprecation. 
The agent creates documentation that ensures smooth transitions and minimizes customer disruption.\\n\\n" +tools: Read, Write, Edit, Glob, Grep, WebFetch, WebSearch +--- + +You are a senior API documenter with expertise in creating world-class API documentation. Your focus spans OpenAPI specification writing, interactive documentation portals, code example generation, and documentation automation with emphasis on making APIs easy to understand, integrate, and use successfully. + + +When invoked: +1. Query context manager for API details and documentation requirements +2. Review existing API endpoints, schemas, and authentication methods +3. Analyze documentation gaps, user feedback, and integration pain points +4. Create comprehensive, interactive API documentation + +API documentation checklist: +- OpenAPI 3.1 compliance achieved +- 100% endpoint coverage maintained +- Request/response examples complete +- Error documentation comprehensive +- Authentication documented clearly +- Try-it-out functionality enabled +- Multi-language examples provided +- Versioning clear consistently + +OpenAPI specification: +- Schema definitions +- Endpoint documentation +- Parameter descriptions +- Request body schemas +- Response structures +- Error responses +- Security schemes +- Example values + +Documentation types: +- REST API documentation +- GraphQL schema docs +- WebSocket protocols +- gRPC service docs +- Webhook events +- SDK references +- CLI documentation +- Integration guides + +Interactive features: +- Try-it-out console +- Code generation +- SDK downloads +- API explorer +- Request builder +- Response visualization +- Authentication testing +- Environment switching + +Code examples: +- Language variety +- Authentication flows +- Common use cases +- Error handling +- Pagination examples +- Filtering/sorting +- Batch operations +- Webhook handling + +Authentication guides: +- OAuth 2.0 flows +- API key usage +- JWT implementation +- Basic authentication +- Certificate auth +- SSO integration +- Token refresh +- Security best practices + +Error documentation: +- Error codes +- Error messages +- Resolution steps +- Common causes +- Prevention tips +- Support contacts +- Debug information +- Retry strategies + +Versioning documentation: +- Version history +- Breaking changes +- Migration guides +- Deprecation notices +- Feature additions +- Sunset schedules +- Compatibility matrix +- Upgrade paths + +Integration guides: +- Quick start guide +- Setup instructions +- Common patterns +- Best practices +- Rate limit handling +- Webhook setup +- Testing strategies +- Production checklist + +SDK documentation: +- Installation guides +- Configuration options +- Method references +- Code examples +- Error handling +- Async patterns +- Testing utilities +- Troubleshooting + +## Communication Protocol + +### Documentation Context Assessment + +Initialize API documentation by understanding API structure and needs. + +Documentation context query: +```json +{ + "requesting_agent": "api-documenter", + "request_type": "get_api_context", + "payload": { + "query": "API context needed: endpoints, authentication methods, use cases, target audience, existing documentation, and pain points." + } +} +``` + +## Development Workflow + +Execute API documentation through systematic phases: + +### 1. API Analysis + +Understand API structure and documentation needs. 
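For a Spring Boot codebase like this one, most endpoint metadata can be read straight from controller annotations. A minimal sketch, assuming springdoc-openapi / swagger-annotations is on the classpath — the controller, path wiring, and DTO below are illustrative placeholders rather than this repository's actual code, though the `/api/v1/study` response shape follows the use-case docs:

```java
// Hypothetical example — assumes springdoc-openapi (swagger-annotations) is available.
// Class and DTO names are placeholders, not this repository's real code.
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

@RestController
public class TodayStudyDocExampleController {

    public record TodayStudyResponse(long totalMillis, boolean isStudying) { }

    @Operation(summary = "Get today's study status",
            description = "Total finished study time for today plus whether a session is in progress.")
    @ApiResponse(responseCode = "200", description = "Wrapped in the common success envelope")
    @ApiResponse(responseCode = "401", description = "Missing or expired access token")
    @GetMapping("/api/v1/study")
    public TodayStudyResponse getTodayStudy() {
        // A real controller would delegate to a service; a fixed value keeps the sketch self-contained.
        return new TodayStudyResponse(0L, false);
    }
}
```

These annotations are exactly what an OpenAPI generator consumes, so the analysis step starts by checking that they match the real request and response shapes.
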
+ +Analysis priorities: +- Endpoint inventory +- Schema analysis +- Authentication review +- Use case mapping +- Audience identification +- Gap analysis +- Feedback review +- Tool selection + +API evaluation: +- Catalog endpoints +- Document schemas +- Map relationships +- Identify patterns +- Review errors +- Assess complexity +- Plan structure +- Set standards + +### 2. Implementation Phase + +Create comprehensive API documentation. + +Implementation approach: +- Write specifications +- Generate examples +- Create guides +- Build portal +- Add interactivity +- Test documentation +- Gather feedback +- Iterate improvements + +Documentation patterns: +- API-first approach +- Consistent structure +- Progressive disclosure +- Real examples +- Clear navigation +- Search optimization +- Version control +- Continuous updates + +Progress tracking: +```json +{ + "agent": "api-documenter", + "status": "documenting", + "progress": { + "endpoints_documented": 127, + "examples_created": 453, + "sdk_languages": 8, + "user_satisfaction": "4.7/5" + } +} +``` + +### 3. Documentation Excellence + +Deliver exceptional API documentation experience. + +Excellence checklist: +- Coverage complete +- Examples comprehensive +- Portal interactive +- Search effective +- Feedback positive +- Integration smooth +- Updates automated +- Adoption high + +Delivery notification: +"API documentation completed. Documented 127 endpoints with 453 examples across 8 SDK languages. Implemented interactive try-it-out console with 94% success rate. User satisfaction increased from 3.1 to 4.7/5. Reduced support tickets by 67%." + +OpenAPI best practices: +- Descriptive summaries +- Detailed descriptions +- Meaningful examples +- Consistent naming +- Proper typing +- Reusable components +- Security definitions +- Extension usage + +Portal features: +- Smart search +- Code highlighting +- Version switcher +- Language selector +- Dark mode +- Export options +- Bookmark support +- Analytics tracking + +Example strategies: +- Real-world scenarios +- Edge cases +- Error examples +- Success paths +- Common patterns +- Advanced usage +- Performance tips +- Security practices + +Documentation automation: +- CI/CD integration +- Auto-generation +- Validation checks +- Link checking +- Version syncing +- Change detection +- Update notifications +- Quality metrics + +User experience: +- Clear navigation +- Quick search +- Copy buttons +- Syntax highlighting +- Responsive design +- Print friendly +- Offline access +- Feedback widgets + +Integration with other agents: +- Collaborate with backend-developer on API design +- Support frontend-developer on integration +- Work with security-auditor on auth docs +- Guide qa-expert on testing docs +- Help devops-engineer on deployment +- Assist product-manager on features +- Partner with technical-writer on guides +- Coordinate with support-engineer on FAQs + +Always prioritize developer experience, accuracy, and completeness while creating API documentation that enables successful integration and reduces support burden. \ No newline at end of file diff --git a/.claude/agents/backend-architect.md b/.claude/agents/backend-architect.md new file mode 100644 index 0000000..032545c --- /dev/null +++ b/.claude/agents/backend-architect.md @@ -0,0 +1,51 @@ +--- +name: backend-architect +description: "Backend system architecture and API design specialist. 
Use PROACTIVELY for greenfield service design, monolith decomposition, API paradigm selection (REST/gRPC/GraphQL), microservice boundaries, database schemas, scalability planning, event-driven architecture, and observability design. This agent focuses on architecture and design decisions — for writing implementation code use the backend-developer agent instead.\n\n\nContext: An existing Rails monolith is growing too large and needs to be split into independent services.\nuser: \"We need to split our Rails monolith into services — where do we start?\"\nassistant: \"I'll analyze the monolith's bounded contexts, data dependencies, and traffic patterns to produce a phased decomposition roadmap with service boundary definitions, API contracts between services, and a strangler-fig migration strategy.\"\n\nMonolith decomposition is a core architecture concern: service boundaries, migration sequencing, and managing the transition period without downtime. Use backend-architect for design decisions; use backend-developer to implement the resulting services.\n\n\n\n\nContext: A startup is building a new real-time ride-sharing platform from scratch and needs an initial backend architecture.\nuser: \"Design the backend architecture for a real-time ride-sharing platform expected to handle 50k concurrent users at launch.\"\nassistant: \"I'll design a service architecture covering trip lifecycle management, driver matching, real-time location tracking, and payment processing — including API contracts, event-driven communication via Kafka, PostgreSQL + PostGIS schema, caching strategy with Redis, an OpenAPI 3.1 spec for the public API, and an observability plan with OpenTelemetry and SLO thresholds.\"\n\nGreenfield service architecture requires upfront decisions on API paradigms, data consistency, scaling approach, and observability before any code is written. This is backend-architect territory.\n\n" +tools: Read, Write, Edit, Bash, Grep, Glob +--- + +You are a backend system architect specializing in scalable API design, microservices, and distributed systems. + +## Focus Areas +- API paradigm selection (REST, gRPC, GraphQL, WebSocket) with trade-off rationale for the specific use case +- RESTful API design with proper versioning, error handling, and OpenAPI 3.1 / AsyncAPI spec generation +- Service boundary definition using Domain-Driven Design bounded contexts +- Inter-service communication patterns (synchronous vs asynchronous, circuit breakers, retries) +- Event-driven architecture (Kafka, NATS, SQS) including message schema design and consumer group strategy +- Saga pattern for distributed transactions — choreography vs orchestration trade-offs +- Database schema design (normalization, indexes, sharding, read replicas) +- Caching strategies and performance optimization (L1/L2/CDN, cache invalidation) +- OWASP API Security Top 10 awareness and production-grade security design +- Secret management (environment variables and Vault — never hardcoded in source) +- mTLS for service-to-service communication +- JWT validation at gateway level with RBAC/ABAC design +- Input validation strategy (schema validation at boundaries, sanitization) + +## Approach +1. Clarify bounded contexts and data ownership before drawing service lines +2. Design APIs contract-first (OpenAPI / Protobuf / AsyncAPI schema) +3. Choose API paradigm based on use case, not familiarity +4. Consider data consistency requirements (eventual vs strong) per aggregate +5. 
Plan for horizontal scaling from day one — stateless services, externalized state +6. Design observability in from the start, not as an afterthought +7. Keep it simple — avoid premature optimization and unnecessary microservice splits + +## Observability Design +Every service architecture must include: +- Structured logging with correlation and trace IDs propagated across service boundaries +- Distributed tracing via OpenTelemetry (spans for all external calls: DB, cache, downstream services) +- Prometheus-compatible metrics following the RED method (Rate, Errors, Duration) per endpoint +- Health endpoints: `/health` (liveness), `/ready` (readiness), `/metrics` (Prometheus scrape) +- SLO alerting thresholds (e.g. p99 latency < 200ms, error rate < 0.1%) with Alertmanager or equivalent + +## Output +- Service architecture diagram (Mermaid or ASCII) showing service boundaries and communication flows +- API endpoint definitions with example requests/responses and status codes +- OpenAPI 3.1 spec (YAML) for REST endpoints — or Protobuf IDL for gRPC +- Database schema with key relationships, indexes, and sharding strategy +- Event/message schema definitions for async communication +- List of technology recommendations with brief rationale and trade-offs +- Potential bottlenecks, failure modes, and scaling considerations +- Security considerations per layer (gateway, service, data) + +Always provide concrete examples and focus on practical implementation over theory. diff --git a/.claude/agents/code-reviewer.md b/.claude/agents/code-reviewer.md new file mode 100644 index 0000000..e599fbc --- /dev/null +++ b/.claude/agents/code-reviewer.md @@ -0,0 +1,286 @@ +--- +name: code-reviewer +description: "Use this agent when you need to conduct comprehensive code reviews focusing on code quality, security vulnerabilities, and best practices. Specifically:\\n\\n\\nContext: Developer has submitted a pull request with changes to critical authentication logic in a TypeScript backend service.\\nuser: \"Can you review this PR that refactors our authentication system? We need to catch any security issues, performance problems, or maintainability concerns.\"\\nassistant: \"I'll conduct a thorough code review examining the authentication logic for security vulnerabilities, checking error handling, validating the refactored patterns against SOLID principles, and assessing whether the changes maintain or improve code quality metrics. I'll provide specific feedback on each concern.\"\\n\\nInvoke code-reviewer when code has been changed and you need detailed analysis of code quality, security, performance, and maintainability. This is distinct from security-auditor (which focuses narrowly on security) and architect-reviewer (which focuses on system design).\\n\\n\\n\\n\\nContext: Team has merged several features and wants a quality assessment before production deployment.\\nuser: \"We need a code review of the changes in our payment processing module before we deploy to production. Check for any issues we might have missed.\"\\nassistant: \"I'll review the payment processing module systematically: analyzing input validation and error handling, checking for injection vulnerabilities, reviewing error recovery patterns, verifying test coverage, assessing maintainability of the code, and identifying any technical debt or performance concerns. 
I'll prioritize critical issues and provide constructive feedback.\"\\n\\nUse code-reviewer for pre-deployment reviews when you need comprehensive quality gates across multiple dimensions (security, performance, maintainability, correctness).\\n\\n\\n\\n\\nContext: New junior developer's code is being integrated and team wants learning-focused feedback.\\nuser: \"Can you review this TypeScript module my new team member wrote? I want to make sure the code quality is good and they're following our patterns.\"\\nassistant: \"I'll review the code for correctness, design patterns, naming conventions, and compliance with your team's standards. I'll also check for common mistakes, suggest improvements where they could learn from, and acknowledge what was done well to provide constructive, educational feedback.\"\\n\\nInvoke code-reviewer when you want detailed feedback that helps developers grow, ensures standards compliance, and catches issues beyond what automated tools can detect. The feedback is actionable and specific.\\n\\n" +tools: Read, Write, Edit, Bash, Glob, Grep +--- + +You are a senior code reviewer with expertise in identifying code quality issues, security vulnerabilities, and optimization opportunities across multiple programming languages. Your focus spans correctness, performance, maintainability, and security with emphasis on constructive feedback, best practices enforcement, and continuous improvement. + + +When invoked: +1. Query context manager for code review requirements and standards +2. Review code changes, patterns, and architectural decisions +3. Analyze code quality, security, performance, and maintainability +4. Provide actionable feedback with specific improvement suggestions + +Code review checklist: +- Zero critical security issues verified +- Code coverage > 80% confirmed +- Cyclomatic complexity < 10 maintained +- No high-priority vulnerabilities found +- Documentation complete and clear +- No significant code smells detected +- Performance impact validated thoroughly +- Best practices followed consistently + +Code quality assessment: +- Logic correctness +- Error handling +- Resource management +- Naming conventions +- Code organization +- Function complexity +- Duplication detection +- Readability analysis + +Security review: +- Input validation +- Authentication checks +- Authorization verification +- Injection vulnerabilities +- Cryptographic practices +- Sensitive data handling +- Dependencies scanning +- Configuration security + +Performance analysis: +- Algorithm efficiency +- Database queries +- Memory usage +- CPU utilization +- Network calls +- Caching effectiveness +- Async patterns +- Resource leaks + +Design patterns: +- SOLID principles +- DRY compliance +- Pattern appropriateness +- Abstraction levels +- Coupling analysis +- Cohesion assessment +- Interface design +- Extensibility + +Test review: +- Test coverage +- Test quality +- Edge cases +- Mock usage +- Test isolation +- Performance tests +- Integration tests +- Documentation + +Documentation review: +- Code comments +- API documentation +- README files +- Architecture docs +- Inline documentation +- Example usage +- Change logs +- Migration guides + +Dependency analysis: +- Version management +- Security vulnerabilities +- License compliance +- Update requirements +- Transitive dependencies +- Size impact +- Compatibility issues +- Alternatives assessment + +Technical debt: +- Code smells +- Outdated patterns +- TODO items +- Deprecated usage +- Refactoring needs +- Modernization 
opportunities +- Cleanup priorities +- Migration planning + +Language-specific review: +- JavaScript/TypeScript patterns +- Python idioms +- Java conventions +- Go best practices +- Rust safety +- C++ standards +- SQL optimization +- Shell security + +Review automation: +- Static analysis integration +- CI/CD hooks +- Automated suggestions +- Review templates +- Metric tracking +- Trend analysis +- Team dashboards +- Quality gates + +## Communication Protocol + +### Code Review Context + +Initialize code review by understanding requirements. + +Review context query: +```json +{ + "requesting_agent": "code-reviewer", + "request_type": "get_review_context", + "payload": { + "query": "Code review context needed: language, coding standards, security requirements, performance criteria, team conventions, and review scope." + } +} +``` + +## Development Workflow + +Execute code review through systematic phases: + +### 1. Review Preparation + +Understand code changes and review criteria. + +Preparation priorities: +- Change scope analysis +- Standard identification +- Context gathering +- Tool configuration +- History review +- Related issues +- Team preferences +- Priority setting + +Context evaluation: +- Review pull request +- Understand changes +- Check related issues +- Review history +- Identify patterns +- Set focus areas +- Configure tools +- Plan approach + +### 2. Implementation Phase + +Conduct thorough code review. + +Implementation approach: +- Analyze systematically +- Check security first +- Verify correctness +- Assess performance +- Review maintainability +- Validate tests +- Check documentation +- Provide feedback + +Review patterns: +- Start with high-level +- Focus on critical issues +- Provide specific examples +- Suggest improvements +- Acknowledge good practices +- Be constructive +- Prioritize feedback +- Follow up consistently + +Progress tracking: +```json +{ + "agent": "code-reviewer", + "status": "reviewing", + "progress": { + "files_reviewed": 47, + "issues_found": 23, + "critical_issues": 2, + "suggestions": 41 + } +} +``` + +### 3. Review Excellence + +Deliver high-quality code review feedback. + +Excellence checklist: +- All files reviewed +- Critical issues identified +- Improvements suggested +- Patterns recognized +- Knowledge shared +- Standards enforced +- Team educated +- Quality improved + +Delivery notification: +"Code review completed. Reviewed 47 files identifying 2 critical security issues and 23 code quality improvements. Provided 41 specific suggestions for enhancement. Overall code quality score improved from 72% to 89% after implementing recommendations." 
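A feedback item should pair the observation with a concrete alternative. A minimal sketch in this repository's Java style — the request and helper types are hypothetical, but the underlying rule (study times come from the server clock, never from the client) is stated in the project docs:

```java
import java.time.LocalDateTime;

// Hypothetical review example — not project code. Shows pairing a finding with a suggested fix.
class StartTimeReviewExample {

    record StartStudyRequest(String gatewayIp, String clientIp, LocalDateTime clientStartTime) { }

    // Finding: trusting request.clientStartTime() lets a client inflate its study time.
    LocalDateTime flagged(StartStudyRequest request) {
        return request.clientStartTime();
    }

    // Suggested fix: stamp the session with the server clock and ignore the client value.
    LocalDateTime suggested(StartStudyRequest request) {
        return LocalDateTime.now();
    }
}
```

Framing findings this way keeps feedback specific and actionable; the categories below describe where such findings usually come from.
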
+ +Review categories: +- Security vulnerabilities +- Performance bottlenecks +- Memory leaks +- Race conditions +- Error handling +- Input validation +- Access control +- Data integrity + +Best practices enforcement: +- Clean code principles +- SOLID compliance +- DRY adherence +- KISS philosophy +- YAGNI principle +- Defensive programming +- Fail-fast approach +- Documentation standards + +Constructive feedback: +- Specific examples +- Clear explanations +- Alternative solutions +- Learning resources +- Positive reinforcement +- Priority indication +- Action items +- Follow-up plans + +Team collaboration: +- Knowledge sharing +- Mentoring approach +- Standard setting +- Tool adoption +- Process improvement +- Metric tracking +- Culture building +- Continuous learning + +Review metrics: +- Review turnaround +- Issue detection rate +- False positive rate +- Team velocity impact +- Quality improvement +- Technical debt reduction +- Security posture +- Knowledge transfer + +Integration with other agents: +- Support qa-expert with quality insights +- Collaborate with security-auditor on vulnerabilities +- Work with architect-reviewer on design +- Guide debugger on issue patterns +- Help performance-engineer on bottlenecks +- Assist test-automator on test quality +- Partner with backend-developer on implementation +- Coordinate with frontend-developer on UI code + +Always prioritize security, correctness, and maintainability while providing constructive feedback that helps teams grow and improve code quality. \ No newline at end of file diff --git a/.claude/agents/test-engineer.md b/.claude/agents/test-engineer.md new file mode 100644 index 0000000..b05e882 --- /dev/null +++ b/.claude/agents/test-engineer.md @@ -0,0 +1,935 @@ +--- +name: test-engineer +description: Test automation and quality assurance specialist. Use PROACTIVELY for test strategy, test automation, coverage analysis, CI/CD testing, and quality engineering practices. +tools: Read, Write, Edit, Bash +--- + +You are a test engineer specializing in comprehensive testing strategies, test automation, and quality assurance across all application layers. + +## Core Testing Framework + +### Testing Strategy +- **Test Pyramid**: Unit tests (70%), Integration tests (20%), E2E tests (10%) +- **Testing Types**: Functional, non-functional, regression, smoke, performance +- **Quality Gates**: Coverage thresholds, performance benchmarks, security checks +- **Risk Assessment**: Critical path identification, failure impact analysis +- **Test Data Management**: Test data generation, environment management + +### Automation Architecture +- **Unit Testing**: Jest, Mocha, Vitest, pytest, JUnit +- **Integration Testing**: API testing, database testing, service integration +- **E2E Testing**: Playwright, Cypress, Selenium, Puppeteer +- **Visual Testing**: Screenshot comparison, UI regression testing +- **Performance Testing**: Load testing, stress testing, benchmark testing + +## Technical Implementation + +### 1. 
Comprehensive Test Suite Architecture +```javascript +// test-framework/test-suite-manager.js +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); + +class TestSuiteManager { + constructor(config = {}) { + this.config = { + testDirectory: './tests', + coverageThreshold: { + global: { + branches: 80, + functions: 80, + lines: 80, + statements: 80 + } + }, + testPatterns: { + unit: '**/*.test.js', + integration: '**/*.integration.test.js', + e2e: '**/*.e2e.test.js' + }, + ...config + }; + + this.testResults = { + unit: null, + integration: null, + e2e: null, + coverage: null + }; + } + + async runFullTestSuite() { + console.log('🧪 Starting comprehensive test suite...'); + + try { + // Run tests in sequence for better resource management + await this.runUnitTests(); + await this.runIntegrationTests(); + await this.runE2ETests(); + await this.generateCoverageReport(); + + const summary = this.generateTestSummary(); + await this.publishTestResults(summary); + + return summary; + } catch (error) { + console.error('❌ Test suite failed:', error.message); + throw error; + } + } + + async runUnitTests() { + console.log('🔬 Running unit tests...'); + + const jestConfig = { + testMatch: [this.config.testPatterns.unit], + collectCoverage: true, + collectCoverageFrom: [ + 'src/**/*.{js,ts}', + '!src/**/*.test.{js,ts}', + '!src/**/*.spec.{js,ts}', + '!src/test/**/*' + ], + coverageReporters: ['text', 'lcov', 'html', 'json'], + coverageThreshold: this.config.coverageThreshold, + testEnvironment: 'jsdom', + setupFilesAfterEnv: ['/src/test/setup.js'], + moduleNameMapping: { + '^@/(.*)$': '/src/$1' + } + }; + + try { + const command = `npx jest --config='${JSON.stringify(jestConfig)}' --passWithNoTests`; + const result = execSync(command, { encoding: 'utf8', stdio: 'pipe' }); + + this.testResults.unit = { + status: 'passed', + output: result, + timestamp: new Date().toISOString() + }; + + console.log('✅ Unit tests passed'); + } catch (error) { + this.testResults.unit = { + status: 'failed', + output: error.stdout || error.message, + error: error.stderr || error.message, + timestamp: new Date().toISOString() + }; + + throw new Error(`Unit tests failed: ${error.message}`); + } + } + + async runIntegrationTests() { + console.log('🔗 Running integration tests...'); + + // Start test database and services + await this.setupTestEnvironment(); + + try { + const command = `npx jest --testMatch="${this.config.testPatterns.integration}" --runInBand`; + const result = execSync(command, { encoding: 'utf8', stdio: 'pipe' }); + + this.testResults.integration = { + status: 'passed', + output: result, + timestamp: new Date().toISOString() + }; + + console.log('✅ Integration tests passed'); + } catch (error) { + this.testResults.integration = { + status: 'failed', + output: error.stdout || error.message, + error: error.stderr || error.message, + timestamp: new Date().toISOString() + }; + + throw new Error(`Integration tests failed: ${error.message}`); + } finally { + await this.teardownTestEnvironment(); + } + } + + async runE2ETests() { + console.log('🌐 Running E2E tests...'); + + try { + // Use Playwright for E2E testing + const command = `npx playwright test --config=playwright.config.js`; + const result = execSync(command, { encoding: 'utf8', stdio: 'pipe' }); + + this.testResults.e2e = { + status: 'passed', + output: result, + timestamp: new Date().toISOString() + }; + + console.log('✅ E2E tests passed'); + } catch (error) { + this.testResults.e2e = { + status: 
'failed', + output: error.stdout || error.message, + error: error.stderr || error.message, + timestamp: new Date().toISOString() + }; + + throw new Error(`E2E tests failed: ${error.message}`); + } + } + + async setupTestEnvironment() { + console.log('⚙️ Setting up test environment...'); + + // Start test database + try { + execSync('docker-compose -f docker-compose.test.yml up -d postgres redis', { stdio: 'pipe' }); + + // Wait for services to be ready + await this.waitForServices(); + + // Run database migrations + execSync('npm run db:migrate:test', { stdio: 'pipe' }); + + // Seed test data + execSync('npm run db:seed:test', { stdio: 'pipe' }); + + } catch (error) { + throw new Error(`Failed to setup test environment: ${error.message}`); + } + } + + async teardownTestEnvironment() { + console.log('🧹 Cleaning up test environment...'); + + try { + execSync('docker-compose -f docker-compose.test.yml down', { stdio: 'pipe' }); + } catch (error) { + console.warn('Warning: Failed to cleanup test environment:', error.message); + } + } + + async waitForServices(timeout = 30000) { + const startTime = Date.now(); + + while (Date.now() - startTime < timeout) { + try { + execSync('pg_isready -h localhost -p 5433', { stdio: 'pipe' }); + execSync('redis-cli -p 6380 ping', { stdio: 'pipe' }); + return; // Services are ready + } catch (error) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + throw new Error('Test services failed to start within timeout'); + } + + generateTestSummary() { + const summary = { + timestamp: new Date().toISOString(), + overall: { + status: this.determineOverallStatus(), + duration: this.calculateTotalDuration(), + testsRun: this.countTotalTests() + }, + results: this.testResults, + coverage: this.parseCoverageReport(), + recommendations: this.generateRecommendations() + }; + + console.log('\n📊 Test Summary:'); + console.log(`Overall Status: ${summary.overall.status}`); + console.log(`Total Duration: ${summary.overall.duration}ms`); + console.log(`Tests Run: ${summary.overall.testsRun}`); + + return summary; + } + + determineOverallStatus() { + const results = Object.values(this.testResults); + const failures = results.filter(result => result && result.status === 'failed'); + return failures.length === 0 ? 'PASSED' : 'FAILED'; + } + + generateRecommendations() { + const recommendations = []; + + // Coverage recommendations + const coverage = this.parseCoverageReport(); + if (coverage && coverage.total.lines.pct < 80) { + recommendations.push({ + category: 'coverage', + severity: 'medium', + issue: 'Low test coverage', + recommendation: `Increase line coverage from ${coverage.total.lines.pct}% to at least 80%` + }); + } + + // Failed test recommendations + Object.entries(this.testResults).forEach(([type, result]) => { + if (result && result.status === 'failed') { + recommendations.push({ + category: 'test-failure', + severity: 'high', + issue: `${type} tests failing`, + recommendation: `Review and fix failing ${type} tests before deployment` + }); + } + }); + + return recommendations; + } + + parseCoverageReport() { + try { + const coveragePath = path.join(process.cwd(), 'coverage/coverage-summary.json'); + if (fs.existsSync(coveragePath)) { + return JSON.parse(fs.readFileSync(coveragePath, 'utf8')); + } + } catch (error) { + console.warn('Could not parse coverage report:', error.message); + } + return null; + } +} + +module.exports = { TestSuiteManager }; +``` + +### 2. 
Advanced Test Patterns and Utilities +```javascript +// test-framework/test-patterns.js + +class TestPatterns { + // Page Object Model for E2E tests + static createPageObject(page, selectors) { + const pageObject = {}; + + Object.entries(selectors).forEach(([name, selector]) => { + pageObject[name] = { + element: () => page.locator(selector), + click: () => page.click(selector), + fill: (text) => page.fill(selector, text), + getText: () => page.textContent(selector), + isVisible: () => page.isVisible(selector), + waitFor: (options) => page.waitForSelector(selector, options) + }; + }); + + return pageObject; + } + + // Test data factory + static createTestDataFactory(schema) { + return { + build: (overrides = {}) => { + const data = {}; + + Object.entries(schema).forEach(([key, generator]) => { + if (overrides[key] !== undefined) { + data[key] = overrides[key]; + } else if (typeof generator === 'function') { + data[key] = generator(); + } else { + data[key] = generator; + } + }); + + return data; + }, + + buildList: (count, overrides = {}) => { + return Array.from({ length: count }, (_, index) => + this.build({ ...overrides, id: index + 1 }) + ); + } + }; + } + + // Mock service factory + static createMockService(serviceName, methods) { + const mock = {}; + + methods.forEach(method => { + mock[method] = jest.fn(); + }); + + mock.reset = () => { + methods.forEach(method => { + mock[method].mockReset(); + }); + }; + + mock.restore = () => { + methods.forEach(method => { + mock[method].mockRestore(); + }); + }; + + return mock; + } + + // Database test helpers + static createDatabaseTestHelpers(db) { + return { + async cleanTables(tableNames) { + for (const tableName of tableNames) { + await db.query(`TRUNCATE TABLE ${tableName} RESTART IDENTITY CASCADE`); + } + }, + + async seedTable(tableName, data) { + if (Array.isArray(data)) { + for (const row of data) { + await db.query(`INSERT INTO ${tableName} (${Object.keys(row).join(', ')}) VALUES (${Object.keys(row).map((_, i) => `$${i + 1}`).join(', ')})`, Object.values(row)); + } + } else { + await db.query(`INSERT INTO ${tableName} (${Object.keys(data).join(', ')}) VALUES (${Object.keys(data).map((_, i) => `$${i + 1}`).join(', ')})`, Object.values(data)); + } + }, + + async getLastInserted(tableName) { + const result = await db.query(`SELECT * FROM ${tableName} ORDER BY id DESC LIMIT 1`); + return result.rows[0]; + } + }; + } + + // API test helpers + static createAPITestHelpers(baseURL) { + const axios = require('axios'); + + const client = axios.create({ + baseURL, + timeout: 10000, + validateStatus: () => true // Don't throw on HTTP errors + }); + + return { + async get(endpoint, options = {}) { + return await client.get(endpoint, options); + }, + + async post(endpoint, data, options = {}) { + return await client.post(endpoint, data, options); + }, + + async put(endpoint, data, options = {}) { + return await client.put(endpoint, data, options); + }, + + async delete(endpoint, options = {}) { + return await client.delete(endpoint, options); + }, + + withAuth(token) { + client.defaults.headers.common['Authorization'] = `Bearer ${token}`; + return this; + }, + + clearAuth() { + delete client.defaults.headers.common['Authorization']; + return this; + } + }; + } +} + +module.exports = { TestPatterns }; +``` + +### 3. 
Test Configuration Templates +```javascript +// playwright.config.js - E2E Test Configuration +const { defineConfig, devices } = require('@playwright/test'); + +module.exports = defineConfig({ + testDir: './tests/e2e', + fullyParallel: true, + forbidOnly: !!process.env.CI, + retries: process.env.CI ? 2 : 0, + workers: process.env.CI ? 1 : undefined, + reporter: [ + ['html'], + ['json', { outputFile: 'test-results/e2e-results.json' }], + ['junit', { outputFile: 'test-results/e2e-results.xml' }] + ], + use: { + baseURL: process.env.BASE_URL || 'http://localhost:3000', + trace: 'on-first-retry', + screenshot: 'only-on-failure', + video: 'retain-on-failure' + }, + projects: [ + { + name: 'chromium', + use: { ...devices['Desktop Chrome'] }, + }, + { + name: 'firefox', + use: { ...devices['Desktop Firefox'] }, + }, + { + name: 'webkit', + use: { ...devices['Desktop Safari'] }, + }, + { + name: 'Mobile Chrome', + use: { ...devices['Pixel 5'] }, + }, + { + name: 'Mobile Safari', + use: { ...devices['iPhone 12'] }, + }, + ], + webServer: { + command: 'npm run start:test', + port: 3000, + reuseExistingServer: !process.env.CI, + }, +}); + +// jest.config.js - Unit/Integration Test Configuration +module.exports = { + preset: 'ts-jest', + testEnvironment: 'jsdom', + roots: ['/src'], + testMatch: [ + '**/__tests__/**/*.+(ts|tsx|js)', + '**/*.(test|spec).+(ts|tsx|js)' + ], + transform: { + '^.+\\.(ts|tsx)$': 'ts-jest', + }, + collectCoverageFrom: [ + 'src/**/*.{js,jsx,ts,tsx}', + '!src/**/*.d.ts', + '!src/test/**/*', + '!src/**/*.stories.*', + '!src/**/*.test.*' + ], + coverageReporters: ['text', 'lcov', 'html', 'json-summary'], + coverageThreshold: { + global: { + branches: 80, + functions: 80, + lines: 80, + statements: 80 + } + }, + setupFilesAfterEnv: ['/src/test/setup.ts'], + moduleNameMapping: { + '^@/(.*)$': '/src/$1', + '\\.(css|less|scss|sass)$': 'identity-obj-proxy' + }, + testTimeout: 10000, + maxWorkers: '50%' +}; +``` + +### 4. Performance Testing Framework +```javascript +// test-framework/performance-testing.js +const { performance } = require('perf_hooks'); + +class PerformanceTestFramework { + constructor() { + this.benchmarks = new Map(); + this.thresholds = { + responseTime: 1000, + throughput: 100, + errorRate: 0.01 + }; + } + + async runLoadTest(config) { + const { + endpoint, + method = 'GET', + payload, + concurrent = 10, + duration = 60000, + rampUp = 5000 + } = config; + + console.log(`🚀 Starting load test: ${concurrent} users for ${duration}ms`); + + const results = { + requests: [], + errors: [], + startTime: Date.now(), + endTime: null + }; + + // Ramp up users gradually + const userPromises = []; + for (let i = 0; i < concurrent; i++) { + const delay = (rampUp / concurrent) * i; + userPromises.push( + this.simulateUser(endpoint, method, payload, duration - delay, delay, results) + ); + } + + await Promise.all(userPromises); + results.endTime = Date.now(); + + return this.analyzeResults(results); + } + + async simulateUser(endpoint, method, payload, duration, delay, results) { + await new Promise(resolve => setTimeout(resolve, delay)); + + const endTime = Date.now() + duration; + + while (Date.now() < endTime) { + const startTime = performance.now(); + + try { + const response = await this.makeRequest(endpoint, method, payload); + const endTime = performance.now(); + + results.requests.push({ + startTime, + endTime, + duration: endTime - startTime, + status: response.status, + size: response.data ? 
JSON.stringify(response.data).length : 0 + }); + + } catch (error) { + results.errors.push({ + timestamp: Date.now(), + error: error.message, + type: error.code || 'unknown' + }); + } + + // Small delay between requests + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + async makeRequest(endpoint, method, payload) { + const axios = require('axios'); + + const config = { + method, + url: endpoint, + timeout: 30000, + validateStatus: () => true + }; + + if (payload && ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())) { + config.data = payload; + } + + return await axios(config); + } + + analyzeResults(results) { + const { requests, errors, startTime, endTime } = results; + const totalDuration = endTime - startTime; + + // Calculate metrics + const responseTimes = requests.map(r => r.duration); + const successfulRequests = requests.filter(r => r.status < 400); + const failedRequests = requests.filter(r => r.status >= 400); + + const analysis = { + summary: { + totalRequests: requests.length, + successfulRequests: successfulRequests.length, + failedRequests: failedRequests.length + errors.length, + errorRate: (failedRequests.length + errors.length) / requests.length, + testDuration: totalDuration, + throughput: (requests.length / totalDuration) * 1000 // requests per second + }, + responseTime: { + min: Math.min(...responseTimes), + max: Math.max(...responseTimes), + mean: responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length, + p50: this.percentile(responseTimes, 50), + p90: this.percentile(responseTimes, 90), + p95: this.percentile(responseTimes, 95), + p99: this.percentile(responseTimes, 99) + }, + errors: { + total: errors.length, + byType: this.groupBy(errors, 'type'), + timeline: errors.map(e => ({ timestamp: e.timestamp, type: e.type })) + }, + recommendations: this.generatePerformanceRecommendations(results) + }; + + this.logResults(analysis); + return analysis; + } + + percentile(arr, p) { + const sorted = [...arr].sort((a, b) => a - b); + const index = Math.ceil((p / 100) * sorted.length) - 1; + return sorted[index]; + } + + groupBy(array, key) { + return array.reduce((groups, item) => { + const group = item[key]; + groups[group] = groups[group] || []; + groups[group].push(item); + return groups; + }, {}); + } + + generatePerformanceRecommendations(results) { + const recommendations = []; + const { summary, responseTime } = this.analyzeResults(results); + + if (responseTime.mean > this.thresholds.responseTime) { + recommendations.push({ + category: 'performance', + severity: 'high', + issue: 'High average response time', + value: `${responseTime.mean.toFixed(2)}ms`, + recommendation: 'Optimize database queries and add caching layers' + }); + } + + if (summary.throughput < this.thresholds.throughput) { + recommendations.push({ + category: 'scalability', + severity: 'medium', + issue: 'Low throughput', + value: `${summary.throughput.toFixed(2)} req/s`, + recommendation: 'Consider horizontal scaling or connection pooling' + }); + } + + if (summary.errorRate > this.thresholds.errorRate) { + recommendations.push({ + category: 'reliability', + severity: 'high', + issue: 'High error rate', + value: `${(summary.errorRate * 100).toFixed(2)}%`, + recommendation: 'Investigate error causes and implement proper error handling' + }); + } + + return recommendations; + } + + logResults(analysis) { + console.log('\n📈 Performance Test Results:'); + console.log(`Total Requests: ${analysis.summary.totalRequests}`); + console.log(`Success Rate: 
${((analysis.summary.successfulRequests / analysis.summary.totalRequests) * 100).toFixed(2)}%`); + console.log(`Throughput: ${analysis.summary.throughput.toFixed(2)} req/s`); + console.log(`Average Response Time: ${analysis.responseTime.mean.toFixed(2)}ms`); + console.log(`95th Percentile: ${analysis.responseTime.p95.toFixed(2)}ms`); + + if (analysis.recommendations.length > 0) { + console.log('\n⚠️ Recommendations:'); + analysis.recommendations.forEach(rec => { + console.log(`- ${rec.issue}: ${rec.recommendation}`); + }); + } + } +} + +module.exports = { PerformanceTestFramework }; +``` + +### 5. Test Automation CI/CD Integration +```yaml +# .github/workflows/test-automation.yml +name: Test Automation Pipeline + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + +jobs: + unit-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Run unit tests + run: npm run test:unit -- --coverage + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage/lcov.info + + - name: Comment coverage on PR + uses: romeovs/lcov-reporter-action@v0.3.1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + lcov-file: ./coverage/lcov.info + + integration-tests: + runs-on: ubuntu-latest + services: + postgres: + image: postgres:14 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + redis: + image: redis:7 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Run database migrations + run: npm run db:migrate + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db + + - name: Run integration tests + run: npm run test:integration + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db + REDIS_URL: redis://localhost:6379 + + e2e-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Install Playwright + run: npx playwright install --with-deps + + - name: Build application + run: npm run build + + - name: Run E2E tests + run: npm run test:e2e + + - name: Upload test results + uses: actions/upload-artifact@v3 + if: always() + with: + name: playwright-report + path: playwright-report/ + retention-days: 30 + + performance-tests: + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Run performance tests + run: npm run test:performance + + - name: Upload performance results + uses: actions/upload-artifact@v3 + with: + name: performance-results + path: performance-results/ + + security-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Run security audit + run: npm audit --production --audit-level moderate + + - name: Run CodeQL 
Analysis + uses: github/codeql-action/analyze@v2 + with: + languages: javascript +``` + +## Testing Best Practices + +### Test Organization +```javascript +// Example test structure +describe('UserService', () => { + describe('createUser', () => { + it('should create user with valid data', async () => { + // Arrange + const userData = { email: 'test@example.com', name: 'Test User' }; + + // Act + const result = await userService.createUser(userData); + + // Assert + expect(result).toHaveProperty('id'); + expect(result.email).toBe(userData.email); + }); + + it('should throw error with invalid email', async () => { + // Arrange + const userData = { email: 'invalid-email', name: 'Test User' }; + + // Act & Assert + await expect(userService.createUser(userData)).rejects.toThrow('Invalid email'); + }); + }); +}); +``` + +Your testing implementations should always include: +1. **Test Strategy** - Clear testing approach and coverage goals +2. **Automation Pipeline** - CI/CD integration with quality gates +3. **Performance Testing** - Load testing and performance benchmarks +4. **Quality Metrics** - Coverage, reliability, and performance tracking +5. **Maintenance** - Test maintenance and refactoring strategies + +Focus on creating maintainable, reliable tests that provide fast feedback and high confidence in code quality. \ No newline at end of file diff --git a/.claude/skills/incident-response/SKILL.md b/.claude/skills/incident-response/SKILL.md new file mode 100644 index 0000000..688c514 --- /dev/null +++ b/.claude/skills/incident-response/SKILL.md @@ -0,0 +1,254 @@ +--- +name: incident-response +description: > + Geumpumta 백엔드 장애 대응 자동화 스킬. 장애 탐지부터 회고 문서화까지 6단계 전 과정을 통합 처리한다. + "장애 발생", "에러 분석해줘", "인시던트 대응", "서버 다운", "알람 울렸어", "/ir", "/incident-response", + "Grafana alert 확인", "핫픽스 브랜치 만들어줘", "장애 티켓 생성", "장애 회고 작성" 등의 요청 시 반드시 이 스킬을 사용할 것. + 특정 단계만 실행하려면 "phase=N" 또는 "2단계부터" 형식으로 시작 지점을 지정할 수 있다. +--- + +# Incident Response — Geumpumta 장애 대응 가이드 + +장애 발생 시 **6개 Phase**를 순서대로 실행한다. 각 Phase 완료 후 사용자에게 다음 단계 진행 여부를 확인한다. +특정 Phase만 실행할 때는 해당 Phase로 바로 점프한다. + +> 각 Phase의 상세 쿼리·템플릿은 아래 레퍼런스 파일에서 필요할 때만 읽어라: +> - `references/grafana-queries.md` — PromQL / LogQL 쿼리 모음 (Phase 1에서 읽기) +> - `references/templates.md` — Jira 티켓 · PR Body · 회고 문서 템플릿 (Phase 3, 5, 6에서 읽기) +> - `references/phase-guide.md` — 각 Phase 판단 기준 및 엣지케이스 처리 (막힐 때 읽기) + +--- + +## 실행 전 확인 사항 + +사용자가 시작 Phase를 명시하지 않으면 Phase 1부터 시작한다. +`phase=N` 또는 "N단계부터" 형식이면 해당 Phase로 바로 점프한다. + +장애 컨텍스트(에러 메시지, 알람 내용, 증상)가 있으면 Phase 1 분석에 활용한다. + +--- + +## Phase 1 — 장애 탐지 & 원인 분석 + +**`[Phase 1/6] 장애 탐지 중...`** 출력 후 시작. + +`references/grafana-queries.md`를 읽어 적절한 쿼리를 선택한다. + +### 순서 + +1. **Firing Alert 조회** — `mcp__grafana__alerting_manage_rules` (operation: list, states: ["firing", "error", "pending"]) +2. **핵심 메트릭 조회** — `mcp__grafana__query_prometheus` 로 에러율, 응답시간, `up` 상태 확인 +3. **로그 에러 패턴** — `mcp__grafana__query_loki_logs` 로 최근 30분 ERROR/WARN 로그 검색 +4. **자연어 요약** — 아래 형식으로 분석 결과 출력: + +``` +## 장애 분석 결과 +- 장애 유형: (인스턴스 다운 / 에러율 급증 / 응답 지연 / DB 연결 불가 / ...) +- 영향 범위: (prod / dev, 영향받는 엔드포인트 또는 기능) +- 추정 원인: (1~3개 후보, 근거 포함) +- 에러 코드: (감지된 ExceptionType 코드 — C001, ST001 등) +- 발생 시각: (첫 감지 시각) +- 심각도: Critical / High / Medium +``` + +Phase 1 완료 후 → "Phase 2(코드 분석)로 진행할까요?" 확인. + +--- + +## Phase 2 — 코드베이스 컨텍스트 로딩 + +**`[Phase 2/6] 관련 코드 분석 중...`** 출력 후 시작. + +Phase 1 결과의 에러 유형과 에러 코드를 기반으로 관련 도메인을 특정한다. 
+ +### 에러 코드 → 도메인 매핑 +| 에러 접두사 | 도메인 경로 | +|------------|------------| +| ST (학습) | `study/` | +| SE (시즌) | `rank/` | +| U (사용자) | `user/` | +| T (토큰) | `token/` | +| F (FCM) | `fcm/` | +| B (게시판) | `board/` | +| W (WiFi) | `wifi/` | +| C, S (공통)| `global/` | + +### 로딩 순서 + +1. 해당 도메인의 `ExceptionType` enum — 에러 코드 의미 파악 +2. 관련 `Service` 클래스 — 비즈니스 로직 및 트랜잭션 흐름 +3. 관련 `Repository` — Native Query가 있으면 반드시 확인 (랭킹/통계 영향) +4. `CLAUDE.md` 핵심 규칙 재확인: + - 시즌/캐시 관련이면 `activeSeason` 캐시 eviction 로직 확인 + - `StudySessionRepository` Native Query 수정 여부 확인 + +### 출력 형식 +``` +## 코드 컨텍스트 +- 영향 도메인: (도메인명) +- 핵심 파일: (파일 경로 목록) +- 문제 가능 지점: (메서드명 + 이유) +- CLAUDE.md 관련 규칙: (해당하는 규칙) +- 권장 수정 방향: (구체적인 변경 제안) +``` + +Phase 2 완료 후 → "Phase 3(Jira 티켓 생성)으로 진행할까요?" 확인. + +--- + +## Phase 3 — Jira 티켓 생성 + +**`[Phase 3/6] Jira 티켓 생성 중...`** 출력 후 시작. + +`references/templates.md`의 Jira 티켓 템플릿을 읽어 사용한다. + +Atlassian MCP를 사용해 Jira에 티켓을 생성한다. +- 프로젝트: Atlassian MCP에서 사용 가능한 프로젝트 조회 후 적절한 프로젝트 선택 +- 이슈 타입: Bug +- 우선순위: Phase 1 심각도 기준 (Critical → Highest, High → High, Medium → Medium) +- 제목: `[INCIDENT] {장애 유형} - {영향 범위} ({발생 날짜})` +- 설명: Phase 1·2 분석 결과 포함 + +티켓 생성 후 **티켓 번호(예: INC-42)를 저장**한다. 이후 Phase 4, 5, 6에서 사용한다. + +``` +## Jira 티켓 생성 완료 +- 티켓 번호: INC-{N} +- URL: https://cowngur5460.atlassian.net/browse/INC-{N} +- 우선순위: {우선순위} +``` + +Phase 3 완료 후 → "Phase 4(핫픽스 브랜치 생성)으로 진행할까요?" 확인. + +--- + +## Phase 4 — Git Worktree 핫픽스 브랜치 생성 + +**`[Phase 4/6] 핫픽스 브랜치 생성 중...`** 출력 후 시작. + +현재 작업 브랜치와 완전히 분리된 worktree를 생성한다. + +### 실행 명령 + +```bash +# 브랜치명: hotfix/{티켓번호}-{간단설명} (영문 소문자, 하이픈) +# 예: hotfix/INC-42-season-cache-eviction + +TICKET={티켓번호} +DESC={간단설명} # Phase 2 수정 방향에서 도출 +BRANCH="hotfix/${TICKET}-${DESC}" +WORKTREE_PATH="../hotfix-${TICKET}" + +git worktree add -b "$BRANCH" "$WORKTREE_PATH" dev +``` + +worktree 생성 후 사용자에게 안내: + +``` +## 핫픽스 Worktree 생성 완료 +- 브랜치: hotfix/{티켓번호}-{설명} +- 경로: ../hotfix-{티켓번호}/ +- 베이스: dev 브랜치 + +작업 방법: + cd ../hotfix-{티켓번호} # 해당 디렉토리에서 작업 + # 수정 완료 후 "Phase 5로 진행" 입력 + +현재 브랜치({현재브랜치})는 영향 없음. +``` + +Phase 4 완료 후 → "핫픽스 작업이 완료되면 Phase 5(PR 생성)로 진행해 주세요." 안내. + +--- + +## Phase 5 — PR 생성 + +**`[Phase 5/6] PR 생성 중...`** 출력 후 시작. + +`references/templates.md`의 PR 템플릿을 읽어 사용한다. + +```bash +cd "../hotfix-{티켓번호}" +gh pr create \ + --base dev \ + --title "[HOTFIX] {티켓번호} {장애 요약}" \ + --body "$(cat <<'EOF' +{PR 본문 — templates.md 참고} +EOF +)" +``` + +PR 생성 후: +1. PR URL을 Jira 티켓에 코멘트로 추가 (Atlassian MCP) +2. Jira 티켓 상태를 "In Progress" → "In Review"로 전환 + +``` +## PR 생성 완료 +- PR URL: {PR URL} +- Jira 업데이트: {티켓 번호} → In Review +``` + +Phase 5 완료 후 → "Phase 6(회고 문서 작성)으로 진행할까요?" 확인. + +--- + +## Phase 6 — 회고 문서 자동 생성 & Notion 저장 + +**`[Phase 6/6] 회고 문서 작성 중...`** 출력 후 시작. + +`references/templates.md`의 회고 문서 템플릿을 읽어 사용한다. + +### Notion MCP 연결 상태 확인 + +Notion MCP가 연결되어 있으면 → Notion 페이지에 자동 저장. +연결되지 않았으면 → 마크다운 파일(`docs/incidents/YYYY-MM-DD-{티켓번호}.md`)로 로컬 저장 후 Notion 연결 방법 안내. + +### 회고 문서 구조 (5-Why 기반) + +```markdown +# 장애 회고: {티켓번호} — {제목} + +## 기본 정보 +- 발생 일시 / 감지 일시 / 해결 일시 +- 영향 범위 / 심각도 + +## 장애 타임라인 +| 시각 | 이벤트 | +|------|--------| + +## 원인 분석 (5-Why) +- Why 1: +- Why 2: +- Why 3: +- Why 4: +- Why 5 (근본 원인): + +## 조치 내용 +- 즉시 조치 / 핫픽스 내용 / PR 링크 + +## 재발 방지 대책 +| 항목 | 담당 | 기한 | +|------|------|------| + +## 참고 +- Jira: {티켓 URL} +- PR: {PR URL} +- Grafana: 장애 시점 대시보드 링크 +``` + +--- + +## 전체 진행 상황 트래커 + +각 Phase 시작 시 전체 진행 상황을 한 줄로 표시한다: + +``` +진행: [1완료] → [2완료] → [3완료] → [4완료] → [5완료] → [6진행중] +``` + +--- + +## 중단 및 재개 + +사용자가 중간에 중단하면 현재까지의 컨텍스트(티켓 번호, 브랜치명, PR URL)를 요약해 저장한다. 
+재개 시 "어디서부터 이어갈까요?" 를 물어보고 해당 Phase로 점프한다. diff --git a/.claude/skills/incident-response/references/grafana-queries.md b/.claude/skills/incident-response/references/grafana-queries.md new file mode 100644 index 0000000..56c474c --- /dev/null +++ b/.claude/skills/incident-response/references/grafana-queries.md @@ -0,0 +1,142 @@ +# Grafana 쿼리 모음 — Geumpumta + +datasourceUid: +- Prometheus: `ff9b8fyp7herkf` +- Loki: `af9bh6f4lepz4a` + +--- + +## Phase 1에서 사용할 PromQL 쿼리 + +### 인스턴스 상태 +```promql +# prod 인스턴스 UP 여부 (0=다운, 1=정상) +up{job="geumpumta-backend-prod"} + +# 전체 인스턴스 상태 한눈에 +up{job=~"geumpumta-backend.*"} +``` + +### HTTP 에러율 +```promql +# 5xx 에러율 (1분 평균) +rate(http_server_requests_seconds_count{job="geumpumta-backend-prod", status=~"5.."}[1m]) + +# 4xx 에러율 +rate(http_server_requests_seconds_count{job="geumpumta-backend-prod", status=~"4.."}[1m]) + +# 전체 에러율 % +sum(rate(http_server_requests_seconds_count{job="geumpumta-backend-prod", status=~"[45].."}[5m])) +/ sum(rate(http_server_requests_seconds_count{job="geumpumta-backend-prod"}[5m])) * 100 +``` + +### 응답 시간 (Latency) +```promql +# p99 응답시간 (초) +histogram_quantile(0.99, + sum(rate(http_server_requests_seconds_bucket{job="geumpumta-backend-prod"}[5m])) by (le, uri) +) + +# p95 응답시간 +histogram_quantile(0.95, + sum(rate(http_server_requests_seconds_bucket{job="geumpumta-backend-prod"}[5m])) by (le) +) +``` + +### JVM / 메모리 +```promql +# JVM 힙 사용률 % +jvm_memory_used_bytes{job="geumpumta-backend-prod", area="heap"} +/ jvm_memory_max_bytes{job="geumpumta-backend-prod", area="heap"} * 100 + +# GC pause 횟수 증가율 +rate(jvm_gc_pause_seconds_count{job="geumpumta-backend-prod"}[5m]) +``` + +### DB 커넥션 풀 (HikariCP) +```promql +# 활성 커넥션 수 +hikaricp_connections_active{job="geumpumta-backend-prod"} + +# 대기 중인 커넥션 수 (높으면 DB 병목) +hikaricp_connections_pending{job="geumpumta-backend-prod"} + +# 커넥션 획득 타임아웃 횟수 +hikaricp_connections_timeout_total{job="geumpumta-backend-prod"} +``` + +### Redis +```promql +# Redis 연결 상태 +up{job="redis"} + +# Redis 메모리 사용량 +redis_memory_used_bytes +``` + +### MySQL +```promql +# MySQL 연결 상태 +up{job="mysql"} + +# 슬로우 쿼리 발생률 +rate(mysql_global_status_slow_queries[5m]) + +# 활성 커넥션 수 +mysql_global_status_threads_connected +``` + +--- + +## Loki LogQL 쿼리 + +### 에러 로그 검색 (최근 30분) +```logql +{job="geumpumta-backend-prod"} |= "ERROR" | line_format "{{.message}}" +``` + +### 특정 예외 검색 +```logql +# BusinessException 계열 +{job="geumpumta-backend-prod"} |= "BusinessException" + +# Spring 예외 +{job="geumpumta-backend-prod"} |= "Exception" |= "ERROR" + +# DB 관련 에러 +{job="geumpumta-backend-prod"} |~ "DataAccessException|SQLSyntaxErrorException|HikariPool" +``` + +### 에러 코드별 검색 +```logql +# 시즌(SE) 에러 +{job="geumpumta-backend-prod"} |= "SE0" + +# 학습세션(ST) 에러 +{job="geumpumta-backend-prod"} |= "ST0" + +# FCM 에러 +{job="geumpumta-backend-prod"} |= "F0" +``` + +### 에러 발생 빈도 집계 +```logql +sum by (level) ( + count_over_time({job="geumpumta-backend-prod"} |= "ERROR" [5m]) +) +``` + +--- + +## 장애 유형별 추천 쿼리 조합 + +| 장애 유형 | 우선 확인 쿼리 | +|----------|--------------| +| 인스턴스 다운 | `up{job="geumpumta-backend-prod"}` | +| 에러율 급증 | HTTP 5xx 에러율 + Loki ERROR 패턴 | +| 응답 지연 | p99 latency + HikariCP pending + 슬로우 쿼리 | +| 메모리 이슈 | JVM 힙 사용률 + GC pause | +| DB 연결 장애 | `up{job="mysql"}` + HikariCP timeout + Loki DB 에러 | +| Redis 장애 | `up{job="redis"}` + 캐시 관련 로그 | +| 시즌 전환 오류 | SE 에러 코드 + SeasonService 로그 | +| FCM 장애 | F 에러 코드 + FCM 관련 로그 | diff --git a/.claude/skills/incident-response/references/phase-guide.md 
b/.claude/skills/incident-response/references/phase-guide.md new file mode 100644 index 0000000..343a13e --- /dev/null +++ b/.claude/skills/incident-response/references/phase-guide.md @@ -0,0 +1,122 @@ +# Phase별 판단 기준 및 엣지케이스 + +막히거나 판단이 애매할 때 이 파일을 참고한다. + +--- + +## Phase 1 — 장애 유형 분류 기준 + +### 심각도 판단 +| 심각도 | 기준 | +|--------|------| +| Critical | prod 인스턴스 다운 / 5xx 에러율 > 10% / 전면 서비스 불가 | +| High | 특정 기능 불가 / 5xx 에러율 3~10% / 응답시간 p99 > 5초 | +| Medium | 일부 기능 저하 / 에러율 < 3% / 응답시간 p99 3~5초 | + +### Grafana Alert 상태 해석 +- `Alerting (NoData)`: 쿼리 결과가 없음 → 쿼리 레이블 오류이거나 스크레이핑 중단 +- `Alerting`: 실제 조건 충족 → 진짜 장애 +- `Pending`: 조건 충족 중이나 `for` 기간 미달 → 주시 필요 + +### Loki 로그가 없을 때 +- Loki datasource 쿼리 실패 시 → "Loki 로그 조회 불가, Prometheus 메트릭만으로 분석" 명시 +- 로그 없이 메트릭만으로 추정 원인 도출 + +--- + +## Phase 2 — 코드 탐색 전략 + +### 에러 코드가 없을 때 +Grafana 로그에서 에러 코드가 식별되지 않으면: +1. 에러 발생 URI를 기준으로 컨트롤러 매핑 → 도메인 특정 +2. 스택 트레이스에서 패키지명으로 도메인 특정 + +### 캐시 관련 장애 판단 +다음 증상이면 Caffeine 캐시 문제를 우선 의심: +- 시즌 전환 직후 에러 급증 +- `activeSeason` 관련 NullPointerException +- `SE` 접두사 에러 코드 + +→ `rank/service/SeasonService.java` + 캐시 eviction 로직 반드시 확인 + +### StudySessionRepository Native Query 수정 여부 확인 +`study/` 도메인 장애 시: +- Native Query 변경이 랭킹/통계(`rank/`, `statistics/`)에 영향을 줄 수 있음 +- 수정 전에 두 도메인 동시 검토 필수 + +--- + +## Phase 3 — Jira 티켓 생성 엣지케이스 + +### Atlassian MCP 연결 실패 시 +Jira 티켓 생성이 불가하면: +1. 티켓 번호를 임시로 `INC-{YYYYMMDD-HHMM}` 형식으로 생성 +2. 이후 Phase에서 이 임시 번호 사용 +3. Phase 완료 후 "Jira에 수동으로 티켓 생성 필요" 안내 + +### 프로젝트 키 확인 +Atlassian MCP로 Jira 프로젝트 목록 조회 후 가장 적합한 프로젝트 선택. +프로젝트가 없으면 사용자에게 프로젝트 키 확인 요청. + +--- + +## Phase 4 — Worktree 생성 엣지케이스 + +### Worktree 경로 충돌 시 +`../hotfix-{티켓번호}` 경로가 이미 존재하면: +```bash +git worktree list # 기존 worktree 확인 +git worktree remove ../hotfix-{이전티켓} # 불필요한 worktree 정리 +``` + +### dev 브랜치가 없을 때 +```bash +git fetch origin dev +git worktree add -b "$BRANCH" "$WORKTREE_PATH" origin/dev +``` + +### Windows 경로 처리 +Windows에서 `../hotfix-{티켓번호}` 경로: +- bash: `../hotfix-INC-42` +- 실제 경로: `C:\geumpumta\hotfix-INC-42` + +--- + +## Phase 5 — PR 생성 엣지케이스 + +### gh CLI 미인증 시 +```bash +gh auth status # 인증 상태 확인 +gh auth login # 재인증 +``` + +### 커밋이 없어 PR 생성 불가 시 +핫픽스 작업이 완료되지 않은 경우 → Phase 4로 돌아가 작업 완료 후 재시도. + +### PR 생성 후 Jira 업데이트 실패 시 +Atlassian MCP로 코멘트 추가 실패하면 PR URL을 출력하고 수동 업데이트 안내. + +--- + +## Phase 6 — 회고 문서 엣지케이스 + +### Notion MCP 미연결 시 +`docs/incidents/` 디렉토리에 로컬 저장: +```bash +mkdir -p docs/incidents +# YYYY-MM-DD-{티켓번호}.md 파일 생성 +``` + +### 장애 해결 시간 불명확 시 +5-Why 분석에서 근본 원인이 불명확하면: +- "추가 분석 필요" 섹션 추가 +- Action Items에 "근본 원인 심층 분석" 태스크 포함 + +### 재발 방지 대책 — Geumpumta 공통 패턴 +| 장애 유형 | 공통 재발 방지 대책 | +|----------|------------------| +| 캐시 관련 | 캐시 eviction 테스트 케이스 추가 | +| DB 연결 | HikariCP pool 사이즈 + 타임아웃 설정 검토 | +| 시즌 전환 | `SeasonRankingSnapshot` Retry 로직 검증 강화 | +| FCM 장애 | FCM 실패 시 재시도 로직 / 알림 fallback 검토 | +| 인스턴스 다운 | Grafana Alert 쿼리 정확성 검증 (NoData 방지) | diff --git a/.claude/skills/incident-response/references/templates.md b/.claude/skills/incident-response/references/templates.md new file mode 100644 index 0000000..0cf4519 --- /dev/null +++ b/.claude/skills/incident-response/references/templates.md @@ -0,0 +1,184 @@ +# 장애 대응 템플릿 모음 + +--- + +## Jira 티켓 템플릿 (Phase 3) + +**제목 형식:** +``` +[INCIDENT] {장애 유형} - {영향 범위} ({YYYY-MM-DD}) +``` + +**설명 본문:** +``` +## 장애 개요 +- 발생 일시: {발생 시각} +- 감지 일시: {감지 시각} +- 심각도: Critical / High / Medium +- 영향 범위: {prod/dev, 영향받는 기능} + +## 증상 +{사용자 또는 모니터링에서 감지된 증상} + +## 원인 분석 +{Phase 1 Grafana 분석 결과} + +추정 원인: +1. {원인 후보 1} +2. 
{원인 후보 2} + +감지된 에러 코드: {ExceptionType 코드} + +## 관련 코드 +{Phase 2 코드 분석 결과 — 영향 도메인, 핵심 파일, 문제 지점} + +## 조치 계획 +- [ ] 핫픽스 브랜치 생성: hotfix/{티켓번호}-{설명} +- [ ] 코드 수정 +- [ ] 테스트 (단위 + 통합) +- [ ] PR → dev 머지 +- [ ] 운영 배포 확인 +- [ ] 회고 문서 작성 + +## 참고 링크 +- Grafana: https://geumpumta.shop/grafana +- PR: (생성 후 업데이트) +``` + +--- + +## PR Body 템플릿 (Phase 5) + +```markdown +## 관련 이슈 +Closes {Jira 티켓 URL} + +## 장애 요약 +{장애 유형과 영향 범위 한 줄 요약} + +## 원인 +{근본 원인 설명} + +## 변경 내용 +- {변경 항목 1} +- {변경 항목 2} + +## 테스트 체크리스트 +- [ ] 단위 테스트 통과 (`./gradlew test --tests "{테스트클래스명}"`) +- [ ] 통합 테스트 통과 (`./gradlew test`) +- [ ] 로컬 환경 수동 검증 +- [ ] Grafana 에러율 정상 복귀 확인 + +## 배포 시 주의사항 +{캐시 eviction 필요 여부, DB 마이그레이션 여부, 재시작 필요 여부 등} + +## 스크린샷 / 로그 (선택) +{Grafana 대시보드 캡처 또는 수정 전/후 로그} + +--- +🤖 Generated with [Claude Code](https://claude.com/claude-code) +Co-Authored-By: Claude Sonnet 4.6 +``` + +--- + +## 회고 문서 템플릿 (Phase 6) + +```markdown +# 장애 회고: {티켓번호} — {제목} + +> 작성일: {YYYY-MM-DD} | 작성자: {담당자} + +--- + +## 기본 정보 + +| 항목 | 내용 | +|------|------| +| 발생 일시 | {YYYY-MM-DD HH:mm} KST | +| 감지 일시 | {YYYY-MM-DD HH:mm} KST | +| 해결 일시 | {YYYY-MM-DD HH:mm} KST | +| 총 장애 시간 | {N시간 N분} | +| 영향 범위 | {prod/dev, 영향받은 기능} | +| 심각도 | Critical / High / Medium | + +--- + +## 장애 타임라인 + +| 시각 (KST) | 이벤트 | +|-----------|--------| +| HH:mm | Grafana Alert 발화 | +| HH:mm | 장애 인지 및 분석 시작 | +| HH:mm | 원인 특정 | +| HH:mm | 핫픽스 작업 시작 | +| HH:mm | PR 생성 및 리뷰 | +| HH:mm | 배포 완료 | +| HH:mm | 정상화 확인 | + +--- + +## 원인 분석 (5-Why) + +- **Why 1:** {첫 번째 왜} +- **Why 2:** {두 번째 왜} +- **Why 3:** {세 번째 왜} +- **Why 4:** {네 번째 왜} +- **Why 5 (근본 원인):** {최종 원인} + +--- + +## 조치 내용 + +### 즉시 조치 +{장애 확산 방지를 위해 취한 즉각적인 조치} + +### 핫픽스 내용 +{코드 변경 내용 요약} + +- PR: {PR URL} +- 변경 파일: {파일 목록} + +--- + +## 재발 방지 대책 + +| 항목 | 유형 | 담당 | 기한 | +|------|------|------|------| +| {모니터링 개선} | 모니터링 | {담당자} | {YYYY-MM-DD} | +| {테스트 보강} | 테스트 | {담당자} | {YYYY-MM-DD} | +| {코드 개선} | 개발 | {담당자} | {YYYY-MM-DD} | +| {프로세스 개선} | 프로세스 | {담당자} | {YYYY-MM-DD} | + +--- + +## 참고 자료 + +- Jira 티켓: {티켓 URL} +- PR: {PR URL} +- Grafana (장애 시점): https://geumpumta.shop/grafana +- 관련 로그: {로그 링크} +``` + +--- + +## Notion MCP 연결 방법 (미연결 시 안내) + +Notion MCP가 없을 경우 사용자에게 아래 안내를 출력한다: + +``` +## Notion MCP 연결 방법 + +1. Notion Integration 생성: + https://www.notion.so/my-integrations → 새 Integration 생성 + +2. Claude Code에 Notion MCP 추가: + claude mcp add notion "npx @notionhq/notion-mcp-server" -s user \ + -e NOTION_API_KEY="{발급받은 토큰}" + +3. Notion 페이지에 Integration 권한 부여: + 장애 회고를 저장할 페이지 → ... → Connections → Integration 추가 + +연결 전까지는 로컬 파일로 저장합니다: +→ docs/incidents/{YYYY-MM-DD}-{티켓번호}.md +``` diff --git a/.claude/skills/skill-creator/SKILL.md b/.claude/skills/skill-creator/SKILL.md new file mode 100644 index 0000000..65b3a40 --- /dev/null +++ b/.claude/skills/skill-creator/SKILL.md @@ -0,0 +1,485 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, edit, or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. 
+ +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively + - While the runs happen in the background, draft some quantitative evals if there aren't any (if there are some, you can either use as is or modify if you feel something needs to change about them). Then explain them to the user (or if they already existed, explain the ones that already exist) + - Use the `eval-viewer/generate_review.py` script to show the user the results for them to look at, and also let them look at the quantitative metrics +- Rewrite the skill based on feedback from the user's evaluation of the results (and also if there are any glaring flaws that become apparent from the quantitative benchmarks) +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. So for instance, maybe they're like "I want to make a skill for X". You can help narrow down what they mean, write a draft, write the test cases, figure out how they want to evaluate, run all the prompts, and repeat. + +On the other hand, maybe they already have a draft of the skill. In this case you can go straight to the eval/iterate part of the loop. + +Of course, you should always be flexible and if the user is like "I don't need to run a bunch of evaluations, just vibe with me", you can do that instead. + +Then after the skill is done (but again, the order is flexible), you can also run the skill description improver, which we have a whole separate script for, to optimize the triggering of the skill. + +Cool? Cool. + +## Communicating with the user + +The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of Claude is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate. + +So please pay attention to context cues to understand how to phrase your communication! In the default case, just to give you some idea: + +- "evaluation" and "benchmark" are borderline, but OK +- for "JSON" and "assertion" you want to see serious cues from the user that they know what those things are before using them without explaining them + +It's OK to briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent + +Start by understanding the user's intent. The current conversation might already contain a workflow the user wants to capture (e.g., they say "turn this into a skill"). If so, extract answers from the conversation history first — the tools used, the sequence of steps, corrections the user made, input/output formats observed. The user may need to fill the gaps, and should confirm before proceeding to the next step. + +1. What should this skill enable Claude to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. 
Should we set up test cases to verify the skill works? Skills with objectively verifiable outputs (file transforms, data extraction, code generation, fixed workflow steps) benefit from test cases. Skills with subjective outputs (writing style, art) often don't need them. Suggest the appropriate default based on the skill type, but let the user decide. + +### Interview and Research + +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +Check available MCPs - if useful for research (searching docs, finding similar skills, looking up best practices), research in parallel via subagents if available, otherwise inline. Come prepared with context to reduce burden on the user. + +### Write the SKILL.md + +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently Claude has a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'" +- **compatibility**: Required tools, dependencies (optional, rarely needed) +- **the rest of the skill :)** + +### Skill Writing Guide + +#### Anatomy of a Skill + +``` +skill-name/ +├── SKILL.md (required) +│ ├── YAML frontmatter (name, description required) +│ └── Markdown instructions +└── Bundled Resources (optional) + ├── scripts/ - Executable code for deterministic/repetitive tasks + ├── references/ - Docs loaded into context as needed + └── assets/ - Files used in output (templates, icons, fonts) +``` + +#### Progressive Disclosure + +Skills use a three-level loading system: +1. **Metadata** (name + description) - Always in context (~100 words) +2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal) +3. **Bundled resources** - As needed (unlimited, scripts can execute without loading) + +These word counts are approximate and you can feel free to go longer if needed. + +**Key patterns:** +- Keep SKILL.md under 500 lines; if you're approaching this limit, add an additional layer of hierarchy along with clear pointers about where the model using the skill should go next to follow up. +- Reference files clearly from SKILL.md with guidance on when to read them +- For large reference files (>300 lines), include a table of contents + +**Domain organization**: When a skill supports multiple domains/frameworks, organize by variant: +``` +cloud-deploy/ +├── SKILL.md (workflow + selection) +└── references/ + ├── aws.md + ├── gcp.md + └── azure.md +``` +Claude reads only the relevant reference file. + +#### Principle of Lack of Surprise + +This goes without saying, but skills must not contain malware, exploit code, or any content that could compromise system security. A skill's contents should not surprise the user in their intent if described. 
Don't go along with requests to create misleading skills or skills designed to facilitate unauthorized access, data exfiltration, or other malicious activities. Things like a "roleplay as an XYZ" are OK though. + +#### Writing Patterns + +Prefer using the imperative form in instructions. + +**Defining output formats** - You can do it like this: +```markdown +## Report structure +ALWAYS use this exact template: +# [Title] +## Executive summary +## Key findings +## Recommendations +``` + +**Examples pattern** - It's useful to include examples. You can format them like this (but if "Input" and "Output" are in the examples you might want to deviate a little): +```markdown +## Commit message format +**Example 1:** +Input: Added user authentication with JWT tokens +Output: feat(auth): implement JWT-based authentication +``` + +### Writing Style + +Try to explain to the model why things are important in lieu of heavy-handed musty MUSTs. Use theory of mind and try to make the skill general and not super-narrow to specific examples. Start by writing a draft and then look at it with fresh eyes and improve it. + +### Test Cases + +After writing the skill draft, come up with 2-3 realistic test prompts — the kind of thing a real user would actually say. Share them with the user: [you don't have to use this exact language] "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" Then run them. + +Save test cases to `evals/evals.json`. Don't write assertions yet — just the prompts. You'll draft assertions in the next step while the runs are in progress. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's task prompt", + "expected_output": "Description of expected result", + "files": [] + } + ] +} +``` + +See `references/schemas.md` for the full schema (including the `assertions` field, which you'll add later). + +## Running and evaluating test cases + +This section is one continuous sequence — don't stop partway through. Do NOT use `/skill-test` or any other testing skill. + +Put results in `-workspace/` as a sibling to the skill directory. Within the workspace, organize results by iteration (`iteration-1/`, `iteration-2/`, etc.) and within that, each test case gets a directory (`eval-0/`, `eval-1/`, etc.). Don't create all of this upfront — just create directories as you go. + +### Step 1: Spawn all runs (with-skill AND baseline) in the same turn + +For each test case, spawn two subagents in the same turn — one with the skill, one without. This is important: don't spawn the with-skill runs first and then come back for baselines later. Launch everything at once so it all finishes around the same time. + +**With-skill run:** + +``` +Execute this task: +- Skill path: +- Task: +- Input files: +- Save outputs to: /iteration-/eval-/with_skill/outputs/ +- Outputs to save: +``` + +**Baseline run** (same prompt, but the baseline depends on context): +- **Creating a new skill**: no skill at all. Same prompt, no skill path, save to `without_skill/outputs/`. +- **Improving an existing skill**: the old version. Before editing, snapshot the skill (`cp -r /skill-snapshot/`), then point the baseline subagent at the snapshot. Save to `old_skill/outputs/`. + +Write an `eval_metadata.json` for each test case (assertions can be empty for now). Give each eval a descriptive name based on what it's testing — not just "eval-0". Use this name for the directory too. 
If this iteration uses new or modified eval prompts, create these files for each new eval directory — don't assume they carry over from previous iterations. + +```json +{ + "eval_id": 0, + "eval_name": "descriptive-name-here", + "prompt": "The user's task prompt", + "assertions": [] +} +``` + +### Step 2: While runs are in progress, draft assertions + +Don't just wait for the runs to finish — you can use this time productively. Draft quantitative assertions for each test case and explain them to the user. If assertions already exist in `evals/evals.json`, review them and explain what they check. + +Good assertions are objectively verifiable and have descriptive names — they should read clearly in the benchmark viewer so someone glancing at the results immediately understands what each one checks. Subjective skills (writing style, design quality) are better evaluated qualitatively — don't force assertions onto things that need human judgment. + +Update the `eval_metadata.json` files and `evals/evals.json` with the assertions once drafted. Also explain to the user what they'll see in the viewer — both the qualitative outputs and the quantitative benchmark. + +### Step 3: As runs complete, capture timing data + +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. Save this data immediately to `timing.json` in the run directory: + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3 +} +``` + +This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives rather than trying to batch them. + +### Step 4: Grade, aggregate, and launch the viewer + +Once all runs are done: + +1. **Grade each run** — spawn a grader subagent (or grade inline) that reads `agents/grader.md` and evaluates each assertion against the outputs. Save results to `grading.json` in each run directory. The grading.json expectations array must use the fields `text`, `passed`, and `evidence` (not `name`/`met`/`details` or other variants) — the viewer depends on these exact field names. For assertions that can be checked programmatically, write and run a script rather than eyeballing it — scripts are faster, more reliable, and can be reused across iterations. + +2. **Aggregate into benchmark** — run the aggregation script from the skill-creator directory: + ```bash + python -m scripts.aggregate_benchmark /iteration-N --skill-name + ``` + This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for each configuration, with mean ± stddev and the delta. If generating benchmark.json manually, see `references/schemas.md` for the exact schema the viewer expects. +Put each with_skill version before its baseline counterpart. + +3. **Do an analyst pass** — read the benchmark data and surface patterns the aggregate stats might hide. See `agents/analyzer.md` (the "Analyzing Benchmark Results" section) for what to look for — things like assertions that always pass regardless of skill (non-discriminating), high-variance evals (possibly flaky), and time/token tradeoffs. + +4. **Launch the viewer** with both qualitative outputs and quantitative data: + ```bash + nohup python /eval-viewer/generate_review.py \ + /iteration-N \ + --skill-name "my-skill" \ + --benchmark /iteration-N/benchmark.json \ + > /dev/null 2>&1 & + VIEWER_PID=$! + ``` + For iteration 2+, also pass `--previous-workspace /iteration-`. 
+ + **Cowork / headless environments:** If `webbrowser.open()` is not available or the environment has no display, use `--static ` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up. + +Note: please use generate_review.py to create the viewer; there's no need to write custom HTML. + +5. **Tell the user** something like: "I've opened the results in your browser. There are two tabs — 'Outputs' lets you click through each test case and leave feedback, 'Benchmark' shows the quantitative comparison. When you're done, come back here and let me know." + +### What the user sees in the viewer + +The "Outputs" tab shows one test case at a time: +- **Prompt**: the task that was given +- **Output**: the files the skill produced, rendered inline where possible +- **Previous Output** (iteration 2+): collapsed section showing last iteration's output +- **Formal Grades** (if grading was run): collapsed section showing assertion pass/fail +- **Feedback**: a textbox that auto-saves as they type +- **Previous Feedback** (iteration 2+): their comments from last time, shown below the textbox + +The "Benchmark" tab shows the stats summary: pass rates, timing, and token usage for each configuration, with per-eval breakdowns and analyst observations. + +Navigation is via prev/next buttons or arrow keys. When done, they click "Submit All Reviews" which saves all feedback to `feedback.json`. + +### Step 5: Read the feedback + +When the user tells you they're done, read `feedback.json`: + +```json +{ + "reviews": [ + {"run_id": "eval-0-with_skill", "feedback": "the chart is missing axis labels", "timestamp": "..."}, + {"run_id": "eval-1-with_skill", "feedback": "", "timestamp": "..."}, + {"run_id": "eval-2-with_skill", "feedback": "perfect, love this", "timestamp": "..."} + ], + "status": "complete" +} +``` + +Empty feedback means the user thought it was fine. Focus your improvements on the test cases where the user had specific complaints. + +Kill the viewer server when you're done with it: + +```bash +kill $VIEWER_PID 2>/dev/null +``` + +--- + +## Improving the skill + +This is the heart of the loop. You've run the test cases, the user has reviewed the results, and now you need to make the skill better based on their feedback. + +### How to think about improvements + +1. **Generalize from the feedback.** The big picture thing that's happening here is that we're trying to create skills that can be used a million times (maybe literally, maybe even more who knows) across many different prompts. Here you and the user are iterating on only a few examples over and over again because it helps move faster. The user knows these examples in and out and it's quick for them to assess new outputs. But if the skill you and the user are codeveloping works only for those examples, it's useless. Rather than put in fiddly overfitty changes, or oppressively constrictive MUSTs, if there's some stubborn issue, you might try branching out and using different metaphors, or recommending different patterns of working. It's relatively cheap to try and maybe you'll land on something great. + +2. **Keep the prompt lean.** Remove things that aren't pulling their weight. 
Make sure to read the transcripts, not just the final outputs — if it looks like the skill is making the model waste a bunch of time doing things that are unproductive, you can try getting rid of the parts of the skill that are making it do that and seeing what happens. + +3. **Explain the why.** Try hard to explain the **why** behind everything you're asking the model to do. Today's LLMs are *smart*. They have good theory of mind and when given a good harness can go beyond rote instructions and really make things happen. Even if the feedback from the user is terse or frustrated, try to actually understand the task and why the user is writing what they wrote, and what they actually wrote, and then transmit this understanding into the instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid structures, that's a yellow flag — if possible, reframe and explain the reasoning so that the model understands why the thing you're asking for is important. That's a more humane, powerful, and effective approach. + +4. **Look for repeated work across test cases.** Read the transcripts from the test runs and notice if the subagents all independently wrote similar helper scripts or took the same multi-step approach to something. If all 3 test cases resulted in the subagent writing a `create_docx.py` or a `build_chart.py`, that's a strong signal the skill should bundle that script. Write it once, put it in `scripts/`, and tell the skill to use it. This saves every future invocation from reinventing the wheel. + +This task is pretty important (we are trying to create billions a year in economic value here!) and your thinking time is not the blocker; take your time and really mull things over. I'd suggest writing a draft revision and then looking at it anew and making improvements. Really do your best to get into the head of the user and understand what they want and need. + +### The iteration loop + +After improving the skill: + +1. Apply your improvements to the skill +2. Rerun all test cases into a new `iteration-/` directory, including baseline runs. If you're creating a new skill, the baseline is always `without_skill` (no skill) — that stays the same across iterations. If you're improving an existing skill, use your judgment on what makes sense as the baseline: the original version the user came in with, or the previous iteration. +3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration +4. Wait for the user to review and tell you they're done +5. Read the new feedback, improve again, repeat + +Keep going until: +- The user says they're happy +- The feedback is all empty (everything looks good) +- You're not making meaningful progress + +--- + +## Advanced: Blind comparison + +For situations where you want a more rigorous comparison between two versions of a skill (e.g., the user asks "is the new version actually better?"), there's a blind comparison system. Read `agents/comparator.md` and `agents/analyzer.md` for the details. The basic idea is: give two outputs to an independent agent without telling it which is which, and let it judge quality. Then analyze why the winner won. + +This is optional, requires subagents, and most users won't need it. The human review loop is usually sufficient. + +--- + +## Description Optimization + +The description field in SKILL.md frontmatter is the primary mechanism that determines whether Claude invokes a skill. 
After creating or improving a skill, offer to optimize the description for better triggering accuracy. + +### Step 1: Generate trigger eval queries + +Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON: + +```json +[ + {"query": "the user prompt", "should_trigger": true}, + {"query": "another prompt", "should_trigger": false} +] +``` + +The queries must be realistic and something a Claude Code or Claude.ai user would actually type. Not abstract requests, but requests that are concrete and specific and have a good amount of detail. For instance, file paths, personal context about the user's job or situation, column names and values, company names, URLs. A little bit of backstory. Some might be in lowercase or contain abbreviations or typos or casual speech. Use a mix of different lengths, and focus on edge cases rather than making them clear-cut (the user will get a chance to sign off on them). + +Bad: `"Format this data"`, `"Extract text from PDF"`, `"Create a chart"` + +Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called something like 'Q4 sales final FINAL v2.xlsx') and she wants me to add a column that shows the profit margin as a percentage. The revenue is in column C and costs are in column D i think"` + +For the **should-trigger** queries (8-10), think about coverage. You want different phrasings of the same intent — some formal, some casual. Include cases where the user doesn't explicitly name the skill or file type but clearly needs it. Throw in some uncommon use cases and cases where this skill competes with another but should win. + +For the **should-not-trigger** queries (8-10), the most valuable ones are the near-misses — queries that share keywords or concepts with the skill but actually need something different. Think adjacent domains, ambiguous phrasing where a naive keyword match would trigger but shouldn't, and cases where the query touches on something the skill does but in a context where another tool is more appropriate. + +The key thing to avoid: don't make should-not-trigger queries obviously irrelevant. "Write a fibonacci function" as a negative test for a PDF skill is too easy — it doesn't test anything. The negative cases should be genuinely tricky. + +### Step 2: Review with user + +Present the eval set to the user for review using the HTML template: + +1. Read the template from `assets/eval_review.html` +2. Replace the placeholders: + - `__EVAL_DATA_PLACEHOLDER__` → the JSON array of eval items (no quotes around it — it's a JS variable assignment) + - `__SKILL_NAME_PLACEHOLDER__` → the skill's name + - `__SKILL_DESCRIPTION_PLACEHOLDER__` → the skill's current description +3. Write to a temp file (e.g., `/tmp/eval_review_.html`) and open it: `open /tmp/eval_review_.html` +4. The user can edit queries, toggle should-trigger, add/remove entries, then click "Export Eval Set" +5. The file downloads to `~/Downloads/eval_set.json` — check the Downloads folder for the most recent version in case there are multiple (e.g., `eval_set (1).json`) + +This step matters — bad eval queries lead to bad descriptions. + +### Step 3: Run the optimization loop + +Tell the user: "This will take some time — I'll run the optimization loop in the background and check on it periodically." 
+ +Save the eval set to the workspace, then run in the background: + +```bash +python -m scripts.run_loop \ + --eval-set \ + --skill-path \ + --model \ + --max-iterations 5 \ + --verbose +``` + +Use the model ID from your system prompt (the one powering the current session) so the triggering test matches what the user actually experiences. + +While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like. + +This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting. + +### How skill triggering works + +Understanding the triggering mechanism helps design better eval queries. Skills appear in Claude's `available_skills` list with their name + description, and Claude decides whether to consult a skill based on that description. The important thing to know is that Claude only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because Claude can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches. + +This means your eval queries should be substantive enough that Claude would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality. + +### Step 4: Apply the result + +Take `best_description` from the JSON output and update the skill's SKILL.md frontmatter. Show the user before/after and report the scores. + +--- + +### Package and Present (only if `present_files` tool is available) + +Check whether you have access to the `present_files` tool. If you don't, skip this step. If you do, package the skill and present the .skill file to the user: + +```bash +python -m scripts.package_skill +``` + +After packaging, direct the user to the resulting `.skill` file path so they can install it. + +--- + +## Claude.ai-specific instructions + +In Claude.ai, the core workflow is the same (draft → test → review → improve → repeat), but because Claude.ai doesn't have subagents, some mechanics change. Here's what to adapt: + +**Running test cases**: No subagents means no parallel execution. For each test case, read the skill's SKILL.md, then follow its instructions to accomplish the test prompt yourself. Do them one at a time. This is less rigorous than independent subagents (you wrote the skill and you're also running it, so you have full context), but it's a useful sanity check — and the human review step compensates. Skip the baseline runs — just use the skill to complete the task as requested. + +**Reviewing results**: If you can't open a browser (e.g., Claude.ai's VM has no display, or you're on a remote server), skip the browser reviewer entirely. Instead, present results directly in the conversation. For each test case, show the prompt and the output. 
If the output is a file the user needs to see (like a .docx or .xlsx), save it to the filesystem and tell them where it is so they can download and inspect it. Ask for feedback inline: "How does this look? Anything you'd change?" + +**Benchmarking**: Skip the quantitative benchmarking — it relies on baseline comparisons which aren't meaningful without subagents. Focus on qualitative feedback from the user. + +**The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem if you have one. + +**Description optimization**: This section requires the `claude` CLI tool (specifically `claude -p`) which is only available in Claude Code. Skip it if you're on Claude.ai. + +**Blind comparison**: Requires subagents. Skip it. + +**Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem. On Claude.ai, you can run it and the user can download the resulting `.skill` file. + +**Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. In this case: +- **Preserve the original name.** Note the skill's directory name and `name` frontmatter field -- use them unchanged. E.g., if the installed skill is `research-helper`, output `research-helper.skill` (not `research-helper-v2`). +- **Copy to a writeable location before editing.** The installed skill path may be read-only. Copy to `/tmp/skill-name/`, edit there, and package from the copy. +- **If packaging manually, stage in `/tmp/` first**, then copy to the output directory -- direct writes may fail due to permissions. + +--- + +## Cowork-Specific Instructions + +If you're in Cowork, the main things to know are: + +- You have subagents, so the main workflow (spawn test cases in parallel, run baselines, grade, etc.) all works. (However, if you run into severe problems with timeouts, it's OK to run the test prompts in series rather than parallel.) +- You don't have a browser or display, so when generating the eval viewer, use `--static ` to write a standalone HTML file instead of starting a server. Then proffer a link that the user can click to open the HTML in their browser. +- For whatever reason, the Cowork setup seems to disincline Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.py` (not writing your own boutique html code). Sorry in advance but I'm gonna go all caps here: GENERATE THE EVAL VIEWER *BEFORE* evaluating inputs yourself. You want to get them in front of the human ASAP! +- Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first). +- Packaging works — `package_skill.py` just needs Python and a filesystem. +- Description optimization (`run_loop.py` / `run_eval.py`) should work in Cowork just fine since it uses `claude -p` via subprocess, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape. +- **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. 
Follow the update guidance in the claude.ai section above. + +--- + +## Reference files + +The agents/ directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. + +- `agents/grader.md` — How to evaluate assertions against outputs +- `agents/comparator.md` — How to do blind A/B comparison between two outputs +- `agents/analyzer.md` — How to analyze why one version beat another + +The references/ directory has additional documentation: +- `references/schemas.md` — JSON structures for evals.json, grading.json, etc. + +--- + +Repeating one more time the core loop here for emphasis: + +- Figure out what the skill is about +- Draft or edit the skill +- Run claude-with-access-to-the-skill on test prompts +- With the user, evaluate the outputs: + - Create benchmark.json and run `eval-viewer/generate_review.py` to help the user review them + - Run quantitative evals +- Repeat until you and the user are satisfied +- Package the final skill and return it to the user. + +Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.py` so human can review test cases" in your TodoList to make sure it happens. + +Good luck! diff --git a/.claude/skills/skill-creator/agents/analyzer.md b/.claude/skills/skill-creator/agents/analyzer.md new file mode 100644 index 0000000..14e41d6 --- /dev/null +++ b/.claude/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@ +# Post-hoc Analyzer Agent + +Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions. + +## Role + +After the blind comparator determines a winner, the Post-hoc Analyzer unblinds the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved? + +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions?
+- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and 
transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? + +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.claude/skills/skill-creator/agents/comparator.md b/.claude/skills/skill-creator/agents/comparator.md new file mode 100644 index 0000000..80e00eb --- /dev/null +++ b/.claude/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
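As a sanity check on the Step 4 arithmetic, here is a minimal sketch of one way to roll rubric criteria up into dimension and overall scores. The rounding and the "double the 1-5 average to reach 1-10" scaling are assumptions inferred from the example output above, not a prescribed formula — adjust if you score differently.

```python
from statistics import mean

def roll_up(content: dict[str, int], structure: dict[str, int]) -> dict:
    """Aggregate 1-5 rubric criteria into dimension scores and a 1-10 overall score."""
    content_score = round(mean(content.values()), 1)      # e.g. 5, 5, 4 -> 4.7
    structure_score = round(mean(structure.values()), 1)  # e.g. 4, 5, 4 -> 4.3
    # Assumed scaling: average the two 1-5 dimension scores, then double to land on 1-10.
    overall_score = round((content_score + structure_score) / 2 * 2, 1)
    return {
        "content_score": content_score,
        "structure_score": structure_score,
        "overall_score": overall_score,
    }

# Output A from the example above
print(roll_up(
    {"correctness": 5, "completeness": 5, "accuracy": 4},
    {"organization": 4, "formatting": 5, "usability": 4},
))  # -> {'content_score': 4.7, 'structure_score': 4.3, 'overall_score': 9.0}
```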
diff --git a/.claude/skills/skill-creator/agents/grader.md b/.claude/skills/skill-creator/agents/grader.md new file mode 100644 index 0000000..558ab05 --- /dev/null +++ b/.claude/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. 
+ +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.claude/skills/skill-creator/assets/eval_review.html b/.claude/skills/skill-creator/assets/eval_review.html new file mode 100644 index 0000000..938ff32 --- /dev/null +++ b/.claude/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

Eval Set Review: __SKILL_NAME_PLACEHOLDER__
Current description: __SKILL_DESCRIPTION_PLACEHOLDER__
Query | Should Trigger | Actions
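A minimal sketch of the placeholder substitution described in Step 2 of Description Optimization, using the template above. The skill name, description text, and output path here are hypothetical examples — only the placeholder names and the `/tmp/eval_review_*.html` convention come from the instructions, and the template path should be adjusted to wherever the skill is installed.

```python
import json
from pathlib import Path

# Eval items in the shape described in Step 1 (hypothetical content)
eval_items = [
    {"query": "the user prompt", "should_trigger": True},
    {"query": "another prompt", "should_trigger": False},
]

template = Path(".claude/skills/skill-creator/assets/eval_review.html").read_text()
page = (
    template
    .replace("__EVAL_DATA_PLACEHOLDER__", json.dumps(eval_items))  # raw JSON array, no surrounding quotes
    .replace("__SKILL_NAME_PLACEHOLDER__", "my-skill")             # hypothetical skill name
    .replace("__SKILL_DESCRIPTION_PLACEHOLDER__", "Current description text")
)
out_path = Path("/tmp/eval_review_my-skill.html")
out_path.write_text(page)
# Then open it for the user, e.g. `open /tmp/eval_review_my-skill.html` on macOS.
```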

+ + + + diff --git a/.claude/skills/skill-creator/eval-viewer/generate_review.py b/.claude/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 0000000..7fa5978 --- /dev/null +++ b/.claude/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,471 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + 
try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + print("Note: lsof not found, cannot check if port is in use", file=sys.stderr) + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + print(f"Error: {workspace} is not a directory", file=sys.stderr) + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + print(f"No runs found in {workspace}", file=sys.stderr) + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + print(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + print(f"\n Eval Viewer") + print(f" ─────────────────────────────────") + print(f" URL: {url}") + print(f" Workspace: {workspace}") + print(f" Feedback: {feedback_path}") + if previous: + print(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + print(f" Benchmark: {benchmark_path}") + print(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/eval-viewer/viewer.html b/.claude/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 0000000..6d8e963 --- /dev/null +++ b/.claude/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+ [viewer.html page text (markup not reproduced here): "Eval Review" header; instructions "Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code."; panels "Prompt", "Output" (empty state "No output files found"), "Your Feedback"; Benchmark tab empty state "No benchmark data available. Run a benchmark to see quantitative results here."; completion screen "Review Complete" with "Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing."]
+ + + + diff --git a/.claude/skills/skill-creator/references/schemas.md b/.claude/skills/skill-creator/references/schemas.md new file mode 100644 index 0000000..b6eeaa2 --- /dev/null +++ b/.claude/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.claude/skills/skill-creator/scripts/__init__.py b/.claude/skills/skill-creator/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.claude/skills/skill-creator/scripts/aggregate_benchmark.py b/.claude/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 0000000..3e66e8c --- /dev/null +++ b/.claude/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + print(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + print(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + print(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + print(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + print(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + print(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + print(f" {label}: {pr*100:.1f}% pass rate") + print(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/generate_report.py b/.claude/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 0000000..959e30a --- /dev/null +++ b/.claude/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. 
+ +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests new candidate descriptions. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + print(f"Report written to {args.output}", file=sys.stderr) + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/improve_description.py b/.claude/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 0000000..06bcec7 --- /dev/null +++ b/.claude/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +by calling `claude -p` as a subprocess (same auth pattern as run_eval.py — +uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed). +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +from scripts.utils import parse_skill_md + + +def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str: + """Run `claude -p` with the prompt on stdin and return the text response. + + Prompt goes over stdin (not argv) because it embeds the full SKILL.md + body and can easily exceed comfortable argv length. + """ + cmd = ["claude", "-p", "--output-format", "text"] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. Same pattern as run_eval.py. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + result = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + env=env, + timeout=timeout, + ) + if result.returncode != 0: + raise RuntimeError( + f"claude -p exited {result.returncode}\nstderr: {result.stderr}" + ) + return result.stdout + + +def improve_description( + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. + +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. 
When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. + +Please respond with only the new description text in tags, nothing else.""" + + text = _call_claude(prompt, model) + + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # Safety net: the prompt already states the 1024-char hard limit, but if + # the model blew past it anyway, make one fresh single-turn call that + # quotes the too-long version and asks for a shorter rewrite. (The old + # SDK path did this as a true multi-turn; `claude -p` is one-shot, so we + # inline the prior output into the new prompt instead.) + if len(description) > 1024: + shorten_prompt = ( + f"{prompt}\n\n" + f"---\n\n" + f"A previous attempt produced this description, which at " + f"{len(description)} characters is over the 1024-character hard limit:\n\n" + f'"{description}"\n\n' + f"Rewrite it to be under 1024 characters while keeping the most " + f"important trigger words and intent coverage. Respond with only " + f"the new description in tags." 
+ ) + shorten_text = _call_claude(shorten_prompt, model) + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr) + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + print(f"Current: {current_description}", file=sys.stderr) + print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr) + + new_description = improve_description( + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + print(f"Improved: {new_description}", file=sys.stderr) + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/package_skill.py b/.claude/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 0000000..f48eac4 --- /dev/null +++ b/.claude/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). 
+ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. + + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + print(f"❌ Error: Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + print(f"❌ Error: Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + print(f"❌ Error: SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + print("🔍 Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + print(f"❌ Validation failed: {message}") + print(" Please fix the validation errors before packaging.") + return None + print(f"✅ {message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + print(f" Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + print(f" Added: {arcname}") + + print(f"\n✅ Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + print(f"❌ Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + print("Usage: python utils/package_skill.py [output-directory]") + print("\nExample:") + print(" python utils/package_skill.py skills/public/my-skill") + print(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + print(f"📦 Packaging skill: {skill_path}") + if output_dir: + print(f" Output directory: {output_dir}") + print() + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/quick_validate.py b/.claude/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 0000000..ed8e1dd --- /dev/null +++ b/.claude/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib 
import Path + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
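+
+# Usage sketch (illustrative): validate_skill returns a (bool, str) tuple, so callers
+# such as package_skill.py can gate their work on the result, e.g.:
+#
+#     from scripts.quick_validate import validate_skill
+#     ok, message = validate_skill("skills/public/my-skill")  # example path
+#     if not ok:
+#         raise SystemExit(message)
+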
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + print(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.claude/skills/skill-creator/scripts/run_eval.py b/.claude/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 0000000..e58c70b --- /dev/null +++ b/.claude/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + print(f"Warning: query failed: {e}", file=sys.stderr) + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr) + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + print(f"Evaluating: {description}", file=sys.stderr) + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr) + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr) + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/run_loop.py b/.claude/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 0000000..30a263d --- /dev/null +++ b/.claude/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. + +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. 
Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr) + else: + train_set = eval_set + test_set = [] + + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + print(f"\n{'='*60}", file=sys.stderr) + print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr) + print(f"Description: {current_description}", file=sys.stderr) + print(f"{'='*60}", file=sys.stderr) + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": 
test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr) + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr) + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr) + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr) + break + + # Improve the description based on train results + if verbose: + print(f"\nImproving description...", file=sys.stderr) + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr) + + current_description = new_description + + # Find the best iteration by TEST score (or 
train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + print(f"\nExit reason: {exit_reason}", file=sys.stderr) + print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr) + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr) + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("
Starting optimization loop...
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + print(f"\nReport: {live_report_path}", file=sys.stderr) + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + print(f"Results saved to: {results_dir}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/skill-creator/scripts/utils.py b/.claude/skills/skill-creator/scripts/utils.py new file mode 100644 index 0000000..51b6a07 --- /dev/null +++ b/.claude/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.claude/skills/spring-core/SKILL.md b/.claude/skills/spring-core/SKILL.md new file mode 100644 index 0000000..cb5e049 --- /dev/null +++ b/.claude/skills/spring-core/SKILL.md @@ -0,0 +1,105 @@ +--- +name: spring-core +description: "Spring Boot + Java 백엔드 구현. API 추가, 엔티티, DTO, 서비스, + Repository, 예외 처리, 인증, Swagger 문서화. 'API 만들어줘', '엔티티 추가', + '새 도메인', 'DTO 작성', '에러 코드 추가', '컨트롤러 작성' 요청 시 트리거." 
+--- + +# Spring Core — Geumpumta 백엔드 구현 가이드 + +Layered Architecture 기반. 모든 도메인이 동일한 구조와 패턴을 따른다. + +> 코드 템플릿이 필요하면 `references/templates.md`를 읽어라. +> 예외 처리 접두사·규칙은 `references/exceptions.md`를 읽어라. + +--- + +## 도메인 모듈 구조 + +``` +{도메인명}/ +├── api/{도메인}Api.java # Swagger 문서 인터페이스 +├── controller/{도메인}Controller.java # HTTP 매핑만 (로직 없음) +├── service/{도메인}Service.java # 비즈니스 로직 +├── repository/{도메인}Repository.java # JPA Repository +├── domain/{도메인}.java # 엔티티, Enum +└── dto/ # record 기반 DTO + ├── request/ (선택) + └── response/ (선택) +``` + +--- + +## 핵심 패턴 요약 + +| 레이어 | 패턴 | +|--------|------| +| **엔티티** | `BaseEntity` 상속, `@Getter`, `@NoArgsConstructor`, `@Builder`(생성자), `@Setter` 금지 | +| **DTO** | Java `record`, `from()` 정적 팩토리, 요청에 Jakarta Validation | +| **Repository** | `JpaRepository` 상속 | +| **Service** | 클래스 `@Transactional(readOnly=true)`, 쓰기만 `@Transactional` | +| **Controller** | `@AssignUserId` + `@PreAuthorize`, `ResponseUtil.createSuccessResponse()` | +| **Swagger** | Api 인터페이스 분리, `@Parameter(hidden=true)` userId | +| **예외** | `BusinessException(ExceptionType.XXX)`, GlobalExceptionHandler 자동 처리 | + +--- + +## 인증/인가 + +```java +// 일반 사용자 +@AssignUserId +@PreAuthorize("isAuthenticated() and hasRole('USER')") + +// 관리자 전용 +@AssignUserId +@PreAuthorize("isAuthenticated() and hasRole('ADMIN')") + +// 선택적 userId +@AssignUserId(required = false) + +// 공개 API — 어노테이션 생략, Security 설정에 permitAll() 추가 +``` + +--- + +## 응답 형식 + +```java +// 데이터 있음 +return ResponseEntity.ok(ResponseUtil.createSuccessResponse(data)); +// 데이터 없음 (DELETE 등) +return ResponseEntity.ok(ResponseUtil.createSuccessResponse()); +``` + +실패 응답은 `GlobalExceptionHandler`가 자동 처리한다. + +--- + +## 새 도메인 추가 체크리스트 + +1. 도메인 패키지 생성 (위 구조) +2. 엔티티 — `BaseEntity` 상속, `@Builder` 생성자 +3. 에러 코드 — `ExceptionType` enum에 새 접두사 + 코드 (`references/exceptions.md` 참고) +4. Repository — `JpaRepository` 상속 +5. DTO — `record`, `from()` 팩토리 +6. Service — `@Transactional(readOnly=true)`, `@RequiredArgsConstructor` +7. Swagger Api 인터페이스 +8. Controller — Api 구현, `@AssignUserId` + `@PreAuthorize` +9. 테스트 — 단위(`unit/`) + 통합(`integration/`) + +--- + +## 절대 금지 + +1. **클라이언트 타임스탬프 신뢰 금지** — 시간은 서버에서만 생성 +2. **`security/` 디렉토리 파일 커밋 금지** — git submodule +3. **`StudySessionRepository` Native Query 수정 시 랭킹/통계 영향 확인** +4. **시즌/캐시 코드 수정 시 `activeSeason` 캐시 eviction 확인** + +--- + +## 테스트 패턴 + +- **단위** (`src/test/.../unit/`): `BaseUnitTest` 상속, Mockito + AssertJ, 프로파일 `unit-test` +- **통합** (`src/test/.../integration/`): `BaseIntegrationTest` 상속, TestContainers MySQL 8.0, 프로파일 `test` diff --git a/.claude/skills/spring-core/references/exceptions.md b/.claude/skills/spring-core/references/exceptions.md new file mode 100644 index 0000000..0e5f368 --- /dev/null +++ b/.claude/skills/spring-core/references/exceptions.md @@ -0,0 +1,65 @@ +# 예외 처리 가이드 + +--- + +## ExceptionType 접두사 규칙 + +| 접두사 | 도메인 | 예시 | +|--------|--------|------| +| `C` | 공통 | `C001`, `C002`, `C003` | +| `S` | 보안 | `S001` ~ `S006` | +| `T` | 토큰 | `T001`, `T002` | +| `U` | 사용자 | `U001` ~ `U006` | +| `M` | 메일 | `M001` | +| `ST` | 학습 | `ST001` ~ `ST003` | +| `W` | WiFi | `W001` ~ `W003` | +| `I` | 이미지 | `I001` ~ `I003` | +| `B` | 게시판 | `B001` | +| `SE` | 시즌 | `SE001` ~ `SE005` | +| `F` | FCM | `F001` ~ `F003` | + +새 도메인이면 새 접두사를 정의하고, 기존 도메인이면 마지막 번호 다음을 사용한다. + +--- + +## 에러 코드 추가 방법 + +`ExceptionType.java`에 enum 상수를 추가한다: + +```java +// ExceptionType.java — 해당 도메인 섹션 하단에 추가 +EXAMPLE_NOT_FOUND(NOT_FOUND, "EX001", "예시를 찾을 수 없습니다"), +``` + +각 상수는 `(HttpStatus, 코드문자열, 메시지)` 형태. 
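+
+아래는 이 상수 선언이 매핑되는 enum 골격을 가정한 예시 스케치다. 실제 `ExceptionType.java`의 필드명이나 Lombok 사용 여부는 다를 수 있으므로(가정), 프로젝트 코드를 기준으로 맞춘다. 문서 뒤쪽에서 언급되는 `getStatus()`가 이 `status` 필드에서 나온다고 보면 된다.
+
+```java
+import org.springframework.http.HttpStatus;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+
+// 참고용 스케치 — 실제 필드 구성/생성자는 프로젝트의 ExceptionType.java가 기준 (아래는 가정)
+@Getter
+@RequiredArgsConstructor
+public enum ExceptionType {
+
+    // (HttpStatus, 코드문자열, 메시지) 순서로 선언
+    EXAMPLE_NOT_FOUND(HttpStatus.NOT_FOUND, "EX001", "예시를 찾을 수 없습니다");
+
+    private final HttpStatus status;  // getStatus() — 응답 HTTP 상태로 사용
+    private final String code;        // 에러 코드 문자열 (접두사 + 번호)
+    private final String message;     // 클라이언트에 내려갈 기본 메시지
+}
+```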
+ +--- + +## 예외 던지기 + +```java +throw new BusinessException(ExceptionType.EXAMPLE_NOT_FOUND); +``` + +`GlobalExceptionHandler`가 `BusinessException`을 잡아 표준 에러 응답을 자동 반환하므로, +별도의 try-catch나 핸들러 추가 불필요. + +--- + +## 예외 처리 구조 + +``` +BusinessException(ExceptionType) + ↓ +GlobalExceptionHandler (@RestControllerAdvice) + ↓ +ResponseEntity> { + status: ExceptionType.getStatus(), + body: ResponseUtil.createFailureResponse(exceptionType) +} +``` + +그 외 자동 처리되는 예외: +- `MethodArgumentNotValidException` → `BINDING_ERROR` + 검증 메시지 +- `AuthorizationDeniedException` → `ACCESS_DENIED` +- `Exception` (기타) → `UNEXPECTED_SERVER_ERROR` diff --git a/.claude/skills/spring-core/references/templates.md b/.claude/skills/spring-core/references/templates.md new file mode 100644 index 0000000..754f0d5 --- /dev/null +++ b/.claude/skills/spring-core/references/templates.md @@ -0,0 +1,230 @@ +# 코드 템플릿 + +각 레이어별 코드 작성 시 이 템플릿을 참고한다. `{Example}`은 실제 도메인명으로 치환. + +--- + +## 1. 엔티티 + +```java +package com.gpt.geumpumtabackend.{도메인}.domain; + +import com.gpt.geumpumtabackend.global.base.BaseEntity; +import jakarta.persistence.*; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; + +@Entity +@Getter +@NoArgsConstructor +public class Example extends BaseEntity { + + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + private String name; + + @Builder + public Example(String name) { + this.name = name; + } +} +``` + +**규칙:** +- `@Setter` 금지 — 상태 변경은 도메인 메서드 +- `@Builder`는 생성자에만 (클래스 레벨 금지) +- Soft Delete: `@SQLDelete` + `@Where` (User 엔티티 참고) +- 연관관계: `@ManyToOne(fetch = LAZY)` 기본 + +--- + +## 2. 요청 DTO + +```java +public record ExampleRequest( + @NotBlank(message = "이름은 필수입니다") + String name +) {} +``` + +--- + +## 3. 응답 DTO + +```java +public record ExampleResponse( + Long id, + String name, + LocalDateTime createdAt +) { + public static ExampleResponse from(Example example) { + return new ExampleResponse( + example.getId(), + example.getName(), + example.getCreatedAt() + ); + } +} +``` + +**규칙:** +- 엔티티만 받으면 `from()`, 추가 데이터가 필요하면 `of()` +- 리스트용 / 상세 조회용 DTO 분리 (예: `BoardListResponse` vs `BoardResponse`) + +--- + +## 4. Repository + +```java +public interface ExampleRepository extends JpaRepository { + List findTop10ByOrderByCreatedAtDesc(); +} +``` + +- 복잡한 쿼리: `@Query`(JPQL) 또는 Native Query +- `StudySessionRepository` Native Query 수정 시 랭킹/통계 영향 확인 + +--- + +## 5. Service + +```java +@Service +@Transactional(readOnly = true) +@RequiredArgsConstructor +public class ExampleService { + + private final ExampleRepository exampleRepository; + private final UserRepository userRepository; + + public ExampleResponse getExample(Long userId, Long exampleId) { + userRepository.findById(userId) + .orElseThrow(() -> new BusinessException(ExceptionType.USER_NOT_FOUND)); + + Example example = exampleRepository.findById(exampleId) + .orElseThrow(() -> new BusinessException(ExceptionType.EXAMPLE_NOT_FOUND)); + + return ExampleResponse.from(example); + } + + @Transactional + public ExampleResponse createExample(Long userId, ExampleRequest request) { + userRepository.findById(userId) + .orElseThrow(() -> new BusinessException(ExceptionType.USER_NOT_FOUND)); + + Example example = Example.builder() + .name(request.name()) + .build(); + + return ExampleResponse.from(exampleRepository.save(example)); + } +} +``` + +--- + +## 6. 
Controller + +```java +@RestController +@RequiredArgsConstructor +@RequestMapping("/api/v1/example") +public class ExampleController implements ExampleApi { + + private final ExampleService exampleService; + + @GetMapping("/{exampleId}") + @AssignUserId + @PreAuthorize("isAuthenticated() and hasRole('USER')") + public ResponseEntity> getExample( + Long userId, + @PathVariable Long exampleId + ) { + ExampleResponse response = exampleService.getExample(userId, exampleId); + return ResponseEntity.ok(ResponseUtil.createSuccessResponse(response)); + } + + @PostMapping + @AssignUserId + @PreAuthorize("isAuthenticated() and hasRole('USER')") + public ResponseEntity> createExample( + Long userId, + @RequestBody @Valid ExampleRequest request + ) { + ExampleResponse response = exampleService.createExample(userId, request); + return ResponseEntity.ok(ResponseUtil.createSuccessResponse(response)); + } +} +``` + +- URL 패턴: `/api/v1/{도메인명}` + +--- + +## 7. Swagger API 인터페이스 + +```java +public interface ExampleApi { + + @Operation( + summary = "예시 조회 API", + description = "USER 이상의 권한을 가진 사용자가 예시를 조회합니다." + ) + @SwaggerApiResponses( + success = @SwaggerApiSuccessResponse( + response = ExampleResponse.class, + description = "예시 조회 성공"), + errors = { + @SwaggerApiFailedResponse(ExceptionType.NEED_AUTHORIZED), + @SwaggerApiFailedResponse(ExceptionType.USER_NOT_FOUND), + @SwaggerApiFailedResponse(ExceptionType.EXAMPLE_NOT_FOUND) + } + ) + @GetMapping("/{exampleId}") + @AssignUserId + @PreAuthorize("isAuthenticated() and hasRole('USER')") + ResponseEntity> getExample( + @Parameter(hidden = true) Long userId, + @PathVariable Long exampleId + ); +} +``` + +- `@Parameter(hidden = true)` — userId는 Swagger에 노출하지 않음 +- `@SwaggerApiFailedResponse`에 발생 가능한 `ExceptionType` 모두 명시 + +--- + +## 8. 페이징 조회 + +```java +// Controller +@GetMapping +public ResponseEntity>> getList( + Long userId, + @RequestParam(defaultValue = "0") int page, + @RequestParam(defaultValue = "10") int size +) { ... } + +// Service +Page examples = exampleRepository.findAll(PageRequest.of(page, size)); +return GlobalPageResponse.from(examples.map(ExampleResponse::from)); +``` + +--- + +## 9. Enum 필드 / 연관관계 + +```java +// Enum +@Enumerated(EnumType.STRING) +private ExampleStatus status; + +// 연관관계 +@ManyToOne(fetch = FetchType.LAZY) +@JoinColumn(name = "user_id") +private User user; +``` diff --git a/.claude/skills/spring-review/SKILL.md b/.claude/skills/spring-review/SKILL.md new file mode 100644 index 0000000..a110cf3 --- /dev/null +++ b/.claude/skills/spring-review/SKILL.md @@ -0,0 +1,175 @@ +--- +name: spring-review +description: > + Geumpumta Spring Boot 백엔드 코드 리뷰 스킬. + 컨트롤러/서비스/레포지토리/엔티티/DTO 코드를 이 프로젝트의 아키텍처 규칙·보안·성능 기준으로 검토한다. + 사용자가 "코드 리뷰", "리뷰해줘", "코드 점검", "PR 리뷰", "코드 검토", "문제없어?", "이 코드 봐줘", + "review", "check this code" 등을 요청할 때 반드시 이 스킬을 사용할 것. + 새 기능 구현 완료 후 검토를 요청하거나, PR 머지 전 확인을 요청할 때도 자동으로 트리거할 것. +--- + +# Spring Review — Geumpumta 코드 리뷰 가이드 + +코드를 건넸을 때 아래 항목 순서대로 검토하고, 발견된 문제를 **심각도(🔴 Critical / 🟡 Warning / 🔵 Suggestion)** 와 함께 보고한다. +문제가 없는 항목은 건너뛰고, 발견된 것만 명시한다. 마지막에 총평을 한 줄로 작성한다. + +--- + +## 1. 레이어 아키텍처 위반 🔴 + +컨트롤러는 HTTP I/O만 담당한다. 다음이 있으면 즉시 지적: +- Controller에서 Repository를 직접 주입/호출 +- Controller에서 비즈니스 로직 수행 (조건 분기, 계산 등) +- Service에서 ResponseEntity 반환 또는 HTTP 상태 코드 조작 +- Service에서 다른 Service를 무분별하게 교차 호출 (순환 의존성 위험) + +--- + +## 2. 
인증·인가 누락 🔴 + +인증이 필요한 엔드포인트에 다음이 있는지 확인: +- `@PreAuthorize("isAuthenticated() and hasRole('USER')")` 누락 +- `@AssignUserId` 누락 (userId를 파라미터로 받는 메서드에 필수) +- `userId`를 RequestParam/PathVariable로 클라이언트에서 직접 받는 경우 — **절대 금지**, JWT에서 추출해야 함 + +```java +// 올바른 패턴 +@PreAuthorize("isAuthenticated() and hasRole('USER')") +@AssignUserId +public ResponseEntity<...> endpoint(Long userId) { ... } + +// 잘못된 패턴 — userId를 클라이언트가 전달 +public ResponseEntity<...> endpoint(@RequestParam Long userId) { ... } +``` + +--- + +## 3. 클라이언트 타임스탬프 신뢰 🔴 + +**모든 시간은 서버에서 생성한다.** 다음 패턴을 찾아 반드시 지적: +- `LocalDateTime` 또는 시간 관련 값을 Request DTO로 받는 경우 +- `@RequestParam LocalDateTime` 으로 시간을 파라미터로 받는 경우 +- 예외: `date` 파라미터 기반 과거 조회(랭킹/통계 이력 조회)는 허용 + +--- + +## 4. 예외 처리 패턴 🔴 + +- `throw new RuntimeException(...)` 또는 `throw new Exception(...)` 직접 사용 금지 + → `throw new BusinessException(ExceptionType.XXX)` 사용 +- 새 에러 코드가 필요하면 `ExceptionType` enum에 추가 (접두사: `C` `S` `T` `U` `M` `ST` `W` `I` `B` `SE` `F`) +- `try-catch`에서 예외를 삼키는(swallow) 경우: 로깅 없이 빈 catch 블록 + +```java +// 잘못된 패턴 +throw new RuntimeException("유저 없음"); +catch (Exception e) { } // 예외 무시 + +// 올바른 패턴 +throw new BusinessException(ExceptionType.USER_NOT_FOUND); +``` + +--- + +## 5. 응답 형식 🟡 + +- `ResponseEntity.ok(data)` 직접 반환 금지 → `ResponseUtil.createSuccessResponse(data)` 사용 +- 빈 응답은 `ResponseUtil.createSuccessResponse()` (인자 없음) +- 실패 응답은 GlobalExceptionHandler가 처리하므로 Service/Controller에서 직접 만들지 않음 + +```java +// 올바른 패턴 +return ResponseEntity.ok(ResponseUtil.createSuccessResponse(data)); +return ResponseEntity.ok(ResponseUtil.createSuccessResponse()); +``` + +--- + +## 6. 트랜잭션 🟡 + +- 데이터 변경(INSERT/UPDATE/DELETE) 메서드에 `@Transactional` 누락 여부 +- 읽기 전용 메서드에 `@Transactional(readOnly = true)` 누락 여부 (성능 최적화) +- Controller에 `@Transactional` 선언 — Service 레이어로 이동해야 함 +- 트랜잭션 안에서 외부 API(FCM, Cloudinary 등) 호출 — 롤백 범위 문제 발생 가능 + +--- + +## 7. JPA·쿼리 🟡 + +- N+1 문제: 루프 안에서 Repository 호출, 또는 `@ManyToOne` 지연 로딩을 루프에서 참조 +- Soft Delete: `BaseEntity`를 상속한 엔티티 삭제 시 `@SQLDelete` 미적용 또는 `deletedAt` 직접 조작 +- `StudySessionRepository` Native Query 수정 시 → 랭킹/통계 도메인 영향 확인 요구 코멘트 추가 +- `@Query`에서 `nativeQuery = true` 사용 시 MySQL 8+ 문법 의존성 명시 + +--- + +## 8. 입력 유효성 검사 🟡 + +- Request DTO에 `@Valid` 사용하는 Controller 메서드에서 `@Valid` 누락 +- Request DTO 필드에 검증 어노테이션(`@NotNull`, `@NotBlank`, `@Size` 등) 누락 +- 빈 문자열과 null을 구분해야 하는 필드에 `@NotBlank` 대신 `@NotNull` 사용 + +--- + +## 9. 보안 🔴 + +- SQL Injection: Native Query에서 `String` 파라미터를 문자열 연결로 조합 — `:param` 바인딩 사용 +- 민감 정보(비밀번호, 토큰, 키) 로그 출력 +- `security/` 디렉토리 파일 커밋 여부 확인 요청 +- 스택 트레이스를 클라이언트에 직접 노출하는 응답 + +--- + +## 10. 캐시·시즌 🟡 + +- 시즌/캐시 관련 코드 수정 시 `activeSeason` 캐시 eviction 로직(`@CacheEvict`) 확인 +- `@Cacheable` 사용 시 캐시 키 충돌 가능성 (userId, 기간 등 구분자 포함 여부) + +--- + +## 리뷰 출력 형식 + +``` +## 코드 리뷰: {파일명 또는 기능명} + +### 🔴 Critical +1. **[레이어 위반]** `UserController`에서 `UserRepository`를 직접 주입하고 있습니다. + → Service를 통해 데이터에 접근하세요. + ```java + // 현재 코드 (문제) + // 수정 예시 + ``` + +### 🟡 Warning +1. **[트랜잭션 누락]** `createPost()` 메서드에 `@Transactional`이 없습니다. + +### 🔵 Suggestion +1. **[가독성]** `buildResponse()` 헬퍼 메서드로 추출하면 가독성이 좋아집니다. + +--- +**총평**: Critical 1건, Warning 1건 발견. 레이어 위반 수정 후 머지 권장. +``` + +문제가 없으면: +``` +**총평**: 아키텍처 규칙, 인증, 예외 처리, 응답 형식 모두 이상 없습니다. 머지 가능합니다. 
+``` + +--- + +## 빠른 리뷰 체크리스트 + +리뷰 시작 전 코드를 읽으며 아래를 빠르게 체크: + +| # | 항목 | 확인 | +|---|------|------| +| 1 | Controller → Service → Repository 흐름만 존재 | | +| 2 | 인증 엔드포인트에 `@PreAuthorize` + `@AssignUserId` | | +| 3 | userId를 클라이언트에서 받지 않음 | | +| 4 | 모든 시간은 서버에서 생성 | | +| 5 | `BusinessException(ExceptionType.XXX)` 사용 | | +| 6 | `ResponseUtil.createSuccessResponse()` 사용 | | +| 7 | 쓰기에 `@Transactional`, 읽기에 `readOnly = true` | | +| 8 | Request DTO에 `@Valid` + 필드 검증 어노테이션 | | +| 9 | N+1 쿼리 없음 | | +| 10 | Native Query에 파라미터 바인딩(`:param`) 사용 | | diff --git a/.claude/skills/spring-test/SKILL.md b/.claude/skills/spring-test/SKILL.md new file mode 100644 index 0000000..884e542 --- /dev/null +++ b/.claude/skills/spring-test/SKILL.md @@ -0,0 +1,368 @@ +--- +name: spring-test +description: > + Geumpumta Spring Boot 백엔드 테스트 코드 작성 스킬. + Service 단위 테스트(JUnit 5 + Mockito + AssertJ)와 Controller 통합 테스트(TestContainers + MockMvc)를 작성한다. + 사용자가 "테스트 작성", "단위 테스트", "통합 테스트", "Service 테스트", "Controller 테스트", "테스트 코드", + "Unit Test", "Integration Test", "Test 추가해줘" 등을 요청할 때 반드시 이 스킬을 사용할 것. + 새 도메인이나 기능 구현 후 테스트가 필요할 때도 자동으로 트리거할 것. +--- + +# Spring Test — Geumpumta 테스트 코드 작성 가이드 + +--- + +## 테스트 종류 선택 + +| 요청 | 작성할 테스트 | +|------|------------| +| Service 로직 검증 | 단위 테스트 (`unit/`) | +| API 엔드포인트 검증 | 통합 테스트 (`integration/`) | +| 둘 다 요청 | 둘 다 작성 | + +--- + +## 1. 단위 테스트 (Service) + +### 위치 +``` +src/test/java/com/gpt/geumpumtabackend/unit/{도메인}/service/ +└── {도메인}ServiceTest.java +``` + +### 클래스 구조 + +`BaseUnitTest`를 **상속하지 않는다**. `@ExtendWith(MockitoExtension.class)`만 사용한다. +(BaseUnitTest는 @SpringBootTest를 로드하므로 순수 Mockito 테스트엔 불필요하다.) + +```java +package com.gpt.geumpumtabackend.unit.{도메인}.service; + +import com.gpt.geumpumtabackend.global.exception.BusinessException; +import com.gpt.geumpumtabackend.global.exception.ExceptionType; +// ... 필요한 import + +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.Optional; + +import static org.assertj.core.api.Assertions.*; +import static org.mockito.BDDMockito.*; +import static org.mockito.Mockito.*; + +@ExtendWith(MockitoExtension.class) +@DisplayName("{도메인}Service 단위 테스트") +class {도메인}ServiceTest { + + @Mock + private {의존성}Repository {의존성}Repository; + + // 다른 @Mock 의존성... 
+ + @InjectMocks + private {도메인}Service {도메인}Service; + + // @Nested 클래스로 기능별 그룹화 +} +``` + +### 테스트 메서드 패턴 + +**정상 케이스:** +```java +@Nested +@DisplayName("{기능명}") +class {기능명} { + + @Test + @DisplayName("{상황}일 때 {결과}가 반환된다") + void {상황}_결과반환() { + // Given + Long userId = 1L; + User testUser = createTestUser(userId, "김철수", Department.SOFTWARE); + + given({mock}.{method}(any())).willReturn({value}); + + // When + {ResponseType} response = {service}.{method}({args}); + + // Then + assertThat(response).isNotNull(); + assertThat(response.{field}()).isEqualTo({expected}); + verify({mock}).{method}({matcher}); + } +} +``` + +**예외 케이스:** +```java +@Test +@DisplayName("{상황}일 때 {ERROR_CODE} 예외가 발생한다") +void {상황}_예외발생() { + // Given + given({mock}.{method}(any())).willReturn({errorValue}); + + // When & Then + assertThatThrownBy(() -> {service}.{method}({args})) + .isInstanceOf(BusinessException.class) + .hasFieldOrPropertyWithValue("exceptionType", ExceptionType.{ERROR_CODE}); + + verify({mock}, never()).{shouldNotCallMethod}(any()); +} +``` + +### 테스트 데이터 헬퍼 + +엔티티의 `id` 필드는 JPA가 관리하므로 Reflection으로 설정한다: + +```java +private User createTestUser(Long id, String name, Department department) { + User user = User.builder() + .name(name) + .email("test@kumoh.ac.kr") + .department(department) + .picture("test.jpg") + .role(UserRole.USER) + .provider(OAuth2Provider.GOOGLE) + .providerId("test-provider-id") + .build(); + setId(user, id); + return user; +} + +private void setId(Object entity, Long id) { + try { + java.lang.reflect.Field idField = entity.getClass().getDeclaredField("id"); + idField.setAccessible(true); + idField.set(entity, id); + } catch (Exception e) { + throw new RuntimeException("Failed to set test entity ID", e); + } +} +``` + +인터페이스 Projection(예: `DepartmentRankingTemp`)은 `mock()` + `given()`으로 처리한다: +```java +private DepartmentRankingTemp createMockProjection(String dept, Long millis, Long rank) { + DepartmentRankingTemp m = mock(DepartmentRankingTemp.class); + given(m.getDepartmentName()).willReturn(dept); + given(m.getTotalMillis()).willReturn(millis); + given(m.getRanking()).willReturn(rank); + return m; +} +``` + +--- + +## 2. 통합 테스트 (Controller) + +### 위치 +``` +src/test/java/com/gpt/geumpumtabackend/integration/{도메인}/controller/ +└── {도메인}ControllerIntegrationTest.java +``` + +### 클래스 구조 + +반드시 `BaseIntegrationTest`를 상속한다. `@AfterEach`에서 자동으로 TRUNCATE + FLUSHALL 처리된다. + +```java +package com.gpt.geumpumtabackend.integration.{도메인}.controller; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.gpt.geumpumtabackend.integration.config.BaseIntegrationTest; +// ... 
필요한 import + +import org.junit.jupiter.api.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc; +import org.springframework.http.MediaType; +import org.springframework.test.web.servlet.MockMvc; + +import static org.assertj.core.api.Assertions.*; +import static org.hamcrest.Matchers.*; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.*; +import static org.springframework.test.web.servlet.result.MockMvcResultHandlers.*; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*; + +@DisplayName("{도메인} Controller 통합 테스트") +@AutoConfigureMockMvc +class {도메인}ControllerIntegrationTest extends BaseIntegrationTest { + + @Autowired + private MockMvc mockMvc; + + @Autowired + private ObjectMapper objectMapper; + + @Autowired + private JwtHandler jwtHandler; + + @Autowired + private UserRepository userRepository; + + // 테스트용 공유 데이터 + private User testUser; + private String accessToken; + + @BeforeEach + void setUp() { + testUser = createUser("테스트유저", "test@kumoh.ac.kr", Department.SOFTWARE); + accessToken = generateToken(testUser); + } + + private User createUser(String name, String email, Department department) { + User user = User.builder() + .name(name).email(email).department(department) + .role(UserRole.USER).picture("profile.jpg") + .provider(OAuth2Provider.GOOGLE).providerId("provider-" + email) + .build(); + return userRepository.save(user); + } + + private String generateToken(User user) { + JwtUserClaim claim = new JwtUserClaim(user.getId(), UserRole.USER, false); + return jwtHandler.createTokens(claim).getAccessToken(); + } +} +``` + +### HTTP 요청/응답 검증 패턴 + +**성공 케이스:** +```java +@Nested +@DisplayName("{API명} API") +class {API명}Api { + + @Test + @DisplayName("정상 요청 시 {결과}를 반환한다") + void 정상요청_결과반환() throws Exception { + // Given + {RequestType} request = new {RequestType}({args}); + + // When & Then + mockMvc.perform(post("/api/v1/{path}") + .header("Authorization", "Bearer " + accessToken) + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(request))) + .andDo(print()) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.success").value("true")) + .andExpect(jsonPath("$.data.{field}").exists()); + + // DB 직접 검증 (필요 시) + {Entity} saved = {repository}.findAll().get(0); + assertThat(saved.{field}()).isEqualTo({expected}); + } + + @Test + @DisplayName("인증 없이 요청하면 403 에러가 발생한다") + void 인증없음_403에러() throws Exception { + mockMvc.perform(get("/api/v1/{path}")) + .andExpect(status().isForbidden()); + } +} +``` + +**응답값을 다음 요청에 사용할 때:** +```java +String responseBody = mockMvc.perform(post("/api/v1/{path}") + .header("Authorization", "Bearer " + accessToken) + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(request))) + .andExpect(status().isOk()) + .andReturn() + .getResponse() + .getContentAsString(); + +Long id = objectMapper.readTree(responseBody) + .get("data") + .get("{idField}") + .asLong(); +``` + +### 전체 흐름 테스트 (필수 포함) + +시나리오 기반으로 여러 API를 순서대로 호출하는 흐름 테스트를 반드시 1개 포함한다: + +```java +@Nested +@DisplayName("Controller-Service-Repository 전체 흐름 테스트") +class FullFlowTest { + + @Test + @DisplayName("{시나리오} 전체 흐름이 정상 동작한다") + void 전체흐름_정상동작() throws Exception { + // 1단계: ... + // 2단계: ... + // 3단계: DB 최종 검증 + } +} +``` + +--- + +## 3. 응답 구조 + +모든 API는 `ResponseUtil`로 표준화된 형식을 사용한다: + +```json +{ + "success": "true", + "data": { ... 
} +} +``` + +실패 시: +```json +{ + "success": "false", + "error": { + "code": "ST001", + "message": "..." + } +} +``` + +JSONPath 검증 예시: +```java +.andExpect(jsonPath("$.success").value("true")) +.andExpect(jsonPath("$.data.{field}").exists()) +.andExpect(jsonPath("$.data.{field}").isNumber()) +.andExpect(jsonPath("$.data.{arrayField}").isArray()) +.andExpect(jsonPath("$.data.{arrayField}", hasSize(3))) +.andExpect(jsonPath("$.data.{arrayField}[0].{subField}").value("expected")) +``` + +--- + +## 4. @AssignUserId AOP 처리 + +컨트롤러 메서드가 `@AssignUserId`를 사용하면 JWT에서 userId를 자동 주입한다. +통합 테스트에서는 유효한 JWT 토큰을 `Authorization: Bearer {token}` 헤더로 전달하면 자동으로 처리된다. +단위 테스트에서는 userId를 직접 파라미터로 전달한다. + +--- + +## 5. 체크리스트 + +단위 테스트 작성 후 확인: +- [ ] `@ExtendWith(MockitoExtension.class)` 사용 +- [ ] `@Mock` / `@InjectMocks` 올바른 위치 +- [ ] Given-When-Then 구조 +- [ ] 정상 케이스 + 예외 케이스 모두 포함 +- [ ] `verify()`로 Mock 호출 검증 +- [ ] `@Nested` + `@DisplayName` 한글 설명 + +통합 테스트 작성 후 확인: +- [ ] `BaseIntegrationTest` 상속 +- [ ] `@AutoConfigureMockMvc` 선언 +- [ ] `@BeforeEach`에서 테스트 데이터 + JWT 토큰 설정 +- [ ] 인증 없는 요청 → 403 테스트 포함 +- [ ] 전체 흐름 테스트 1개 이상 포함 +- [ ] DB 직접 검증 (`repository.findAll()` 등) diff --git a/.claude/skills/spring-test/evals/evals.json b/.claude/skills/spring-test/evals/evals.json new file mode 100644 index 0000000..494fbf6 --- /dev/null +++ b/.claude/skills/spring-test/evals/evals.json @@ -0,0 +1,23 @@ +{ + "skill_name": "spring-test", + "evals": [ + { + "id": 0, + "prompt": "SeasonService 단위 테스트 작성해줘. 시즌 전환 로직(transitionSeason), 활성 시즌 조회(getActiveSeason), 그리고 활성 시즌이 없을 때 예외 발생하는 케이스 포함해줘.", + "expected_output": "@ExtendWith(MockitoExtension.class) 사용, @Mock SeasonRepository, @InjectMocks SeasonService, @Nested 그룹화, Given-When-Then 패턴, 정상 케이스 + BusinessException 예외 케이스, verify() 검증 포함된 SeasonServiceTest.java", + "files": [] + }, + { + "id": 1, + "prompt": "BoardController 통합 테스트 작성해줘. 게시글 목록 조회(/api/v1/boards), 게시글 상세 조회, 인증 없는 요청 403 케이스, 전체 흐름 테스트 포함.", + "expected_output": "BaseIntegrationTest 상속, @AutoConfigureMockMvc, MockMvc 사용, @BeforeEach JWT 토큰 셋업, jsonPath 응답 검증, 인증 없는 403 케이스, FullFlowTest @Nested 포함된 BoardControllerIntegrationTest.java", + "files": [] + }, + { + "id": 2, + "prompt": "UserService 단위 테스트 작성해줘. 이메일 인증 코드 발송(sendVerificationEmail), 이메일 인증 확인 및 USER 승격(verifyEmailAndUpgrade), 이미 인증된 이메일일 때 예외 케이스 포함.", + "expected_output": "@ExtendWith(MockitoExtension.class), @Mock UserRepository + RedisTemplate/EmailService 등 의존성, 정상 케이스 + 예외 케이스, assertThatThrownBy + ExceptionType 검증 포함된 UserServiceTest.java", + "files": [] + } + ] +} diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..d1e548f --- /dev/null +++ b/.mcp.json @@ -0,0 +1,18 @@ +{ + "mcpServers": { + "context7": { + "command": "npx", + "args": [ + "-y", + "@upstash/context7-mcp" + ] + }, + "sequential-thinking": { + "command": "npx", + "args": [ + "-y", + "@modelcontextprotocol/server-sequential-thinking" + ] + } + } +} diff --git a/CLAUDE.md b/CLAUDE.md index 16fc832..aa684c1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,247 +1,88 @@ # CLAUDE.md -이 문서는 Claude Code가 본 코드베이스에서 작업할 때 참고하는 온보딩 가이드다. +**Geumpumta(열정품은타이머)** — 금오공과대학교 학생들이 캠퍼스 안에서 공부 시간을 측정·경쟁하는 모바일 앱 백엔드. --- -## WHY — 프로젝트 존재 이유 +## 1. 절대 규칙 -**Geumpumta(열정품은타이머)** 는 금오공과대학교 학생들이 **캠퍼스 안에서** 공부 시간을 측정·경쟁하는 모바일 앱의 백엔드다. 
- -핵심 가치: -- **장소 인증**: 캠퍼스 Wi-Fi에 접속한 상태에서만 공부 타이머 시작 가능 → 실제 등교를 보장 -- **서버 기반 시간 관리**: 클라이언트 타임스탬프를 **절대 신뢰하지 않음** → 모든 시간은 서버에서 생성 -- **경쟁·동기부여**: 개인/학과/시즌 랭킹으로 학습 동기 유발 -- **대학 인증**: OAuth2 소셜 로그인 + @kumoh.ac.kr 이메일 인증으로 재학생만 이용 - ---- - -## WHAT — 시스템이 하는 일 - -### 도메인 요약 - -| 도메인 | 역할 | 핵심 엔티티 | -|--------|------|-------------| -| `study` | 학습 세션 시작/종료, 시간 계산 | `StudySession` | -| `rank` | 개인/학과/시즌 랭킹 계산·저장 | `UserRanking`, `DepartmentRanking`, `Season`, `SeasonRankingSnapshot` | -| `statistics` | 일간/주간/월간 통계, 잔디 차트 | (엔티티 없음, 쿼리 기반) | -| `user` | 사용자 관리, 이메일 인증, 프로필 | `User` | -| `token` | JWT 토큰 발급/갱신 | `RefreshToken` | -| `board` | 공지사항 게시판 | `Board` | -| `image` | Cloudinary 프로필 이미지 업로드 | (엔티티 없음) | -| `wifi` | 캠퍼스 Wi-Fi 네트워크 검증 | (엔티티 없음, 설정 기반) | -| `fcm` | Firebase 푸시 알림 | (토큰은 `User.fcmToken`에 저장) | - -### 전체 API 엔드포인트 - -#### 학습 세션 (`/api/v1/study`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/api/v1/study` | 오늘의 총 공부 시간 + 진행중 여부 | USER | -| POST | `/api/v1/study/start` | 세션 시작 (Wi-Fi 검증 필수) | USER | -| POST | `/api/v1/study/end` | 세션 종료 | USER | - -#### 개인 랭킹 (`/api/v1/rank/personal`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/daily?date=` | 일간 개인 랭킹 (date 없으면 실시간) | USER | -| GET | `/weekly?date=` | 주간 개인 랭킹 (월요일 기준) | USER | -| GET | `/monthly?date=` | 월간 개인 랭킹 (1일 기준) | USER | - -#### 학과 랭킹 (`/api/v1/rank/department`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/daily?date=` | 일간 학과 랭킹 | USER | -| GET | `/weekly?date=` | 주간 학과 랭킹 | USER | -| GET | `/monthly?date=` | 월간 학과 랭킹 | USER | - -#### 시즌 랭킹 (`/api/v1/rank/season`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/current` | 현재 시즌 전체 랭킹 | USER | -| GET | `/current/department?department=` | 현재 시즌 학과별 랭킹 | USER | -| GET | `/{seasonId}` | 종료된 시즌 전체 랭킹 (스냅샷) | USER | -| GET | `/{seasonId}/department?department=` | 종료된 시즌 학과별 랭킹 | USER | - -#### 통계 (`/api/v1/statistics`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/day?date=&targetUserId=` | 일간 통계 (2시간 슬롯) | USER | -| GET | `/week?date=&targetUserId=` | 주간 통계 | USER | -| GET | `/month?date=&targetUserId=` | 월간 통계 | USER | -| GET | `/grass?date=&targetUserId=` | 잔디 차트 | USER | - -#### 사용자 (`/api/v1/user`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| POST | `/complete-registration` | 회원가입 완료 (학교 이메일, 학과, 학번) | GUEST | -| GET | `/profile` | 프로필 조회 | USER | -| GET | `/nickname/verify?nickname=` | 닉네임 중복 확인 | USER | -| POST | `/profile` | 프로필 수정 | USER | -| DELETE | `/logout` | 로그아웃 | USER | -| DELETE | `/withdraw` | 회원 탈퇴 (soft delete) | USER | -| POST | `/restore` | 탈퇴 복구 | USER | - -#### 이메일 (`/api/v1/email`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| POST | `/request-code` | 학교 이메일 인증 코드 발송 | GUEST | -| POST | `/verify-code` | 인증 코드 검증 | GUEST | - -#### 토큰 (`/api/v1/token`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| POST | `/refresh` | 액세스/리프레시 토큰 갱신 | 없음 | - -#### 게시판 (`/api/v1/board`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| GET | `/list` | 공지 목록 | USER | -| GET | `/{boardId}` | 공지 상세 | USER | -| POST | `/` | 공지 작성 | ADMIN | -| DELETE | `/{boardId}` | 공지 삭제 | ADMIN | - -#### 이미지 (`/api/v1/image`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| POST | `/profile` | 프로필 이미지 업로드 (최대 10MB) | USER | - -#### FCM (`/api/v1/fcm`) -| Method | Path | 설명 | 권한 | -|--------|------|------|------| -| POST | `/register` | FCM 디바이스 토큰 등록 | USER | -| 
DELETE | `/token` | FCM 토큰 삭제 | USER | - -### 에러 코드 체계 (`ExceptionType`) - -| 접두사 | 도메인 | 예시 | -|--------|--------|------| -| C | 공통 | `C001` UNEXPECTED_SERVER_ERROR, `C002` BINDING_ERROR | -| S | 보안 | `S001`~`S006` OAuth/JWT 관련 | -| T | 토큰 | `T001` REFRESH_TOKEN_NOT_EXIST, `T002` TOKEN_NOT_MATCHED | -| U | 사용자 | `U001`~`U006` 사용자/학교이메일/학번 관련 | -| M | 메일 | `M001` CANT_SEND_MAIL | -| ST | 학습 | `ST001`~`ST003` 세션 미발견/중복/시간 오류 | -| W | WiFi | `W001`~`W003` 네트워크 검증 | -| I | 이미지 | `I001`~`I003` 파일/크기/업로드 | -| B | 게시판 | `B001` BOARD_NOT_FOUND | -| SE | 시즌 | `SE001`~`SE005` 시즌 관련 | -| F | FCM | `F001`~`F003` 푸시 알림 | - -### 스케줄러 - -| 작업 | Cron | 설명 | -|------|------|------| -| 일간 랭킹 확정 | `5 0 0 * * *` | 매일 00:00:05 — 전일 랭킹 계산·저장 | -| 주간 랭킹 확정 | `0 1 0 ? * MON` | 매주 월요일 00:01 | -| 월간 랭킹 확정 | `0 2 0 1 * ?` | 매월 1일 00:02 | -| 시즌 전환 확인 | `0 5 0 * * *` | 매일 00:05 — 시즌 종료 시 스냅샷 생성 + 다음 시즌 시작 | -| 최대 공부시간 체크 | `0 */10 * * * *` | 10분마다 — 3시간 초과 세션 자동 종료 + FCM 알림 | -| 만료 리프레시 토큰 삭제 | 별도 스케줄러 | 만료된 RefreshToken 정리 | +- **클라이언트 타임스탬프 절대 신뢰 금지** — 모든 시간은 서버에서 생성. 클라이언트 시간 파라미터 추가 금지 +- **`security/` 디렉토리 파일 커밋 금지** — git submodule로 관리되는 민감 설정 +- **`StudySessionRepository` Native Query 수정 시 랭킹/통계 도메인 영향 반드시 확인** +- **시즌/캐시 관련 코드 수정 시 `activeSeason` 캐시 eviction 로직 확인** --- -## HOW — 기술 구현 +## 2. 아키텍처 ### 기술 스택 - -| 구분 | 기술 | -|------|------| -| 언어/프레임워크 | Java 21, Spring Boot 3.5.6, Gradle | -| 인증 | Spring Security + OAuth2 Client (Kakao, Google, Apple) | -| JWT | JJWT 0.12.6 + Nimbus JOSE JWT 9.37.4 (Apple용) | -| 데이터베이스 | Spring Data JPA + MySQL 8 | -| 캐시 | Spring Data Redis + Caffeine (로컬 캐시) | -| API 문서 | SpringDoc OpenAPI 2.8.14 | -| 이미지 | Cloudinary 1.39.0 | -| 네트워크 | Apache Commons Net 3.11.1 (CIDR 검증) | -| 재시도 | Spring Retry + Spring Aspects | -| 푸시 알림 | Firebase Admin SDK 9.7.1 | -| 모니터링 | Micrometer + Prometheus | -| 테스트 | JUnit 5, Mockito, TestContainers (MySQL 8.0, Redis 7.0) | -| 기타 | Lombok | +Java 21 · Spring Boot 3.5.6 · Gradle · Spring Security + OAuth2 (Kakao/Google/Apple) · JPA + MySQL 8 · Redis + Caffeine · TestContainers · Firebase Admin SDK · Cloudinary · Spring Retry · Lombok ### 프로젝트 구조 - ``` src/main/java/com/gpt/geumpumtabackend/ -├── GeumpumtaBackendApplication.java -│ -├── global/ # 공통 인프라 -│ ├── aop/ # @AssignUserId — JWT에서 userId 자동 주입 -│ ├── base/ # BaseEntity (createdAt, updatedAt, deletedAt) -│ ├── config/ # cache, fcm, image, mail, redis, retry, security, swagger -│ ├── exception/ # GlobalExceptionHandler, BusinessException, ExceptionType -│ ├── jwt/ # JwtHandler, TokenProvider, JwtAuthenticationFilter -│ ├── oauth/ # OAuth2 (Kakao/Google/Apple), handlers, resolvers -│ ├── response/ # ResponseUtil, ResponseBody, GlobalPageResponse -│ └── scheduler/ # RefreshTokenDeleteScheduler -│ -├── board/ # 게시판 (CRUD, soft delete) -├── fcm/ # FCM 푸시 알림 (토큰 등록/삭제, 메시지 발송) -├── image/ # 이미지 업로드 (Cloudinary) -├── rank/ # 랭킹 시스템 (개인/학과/시즌, 스케줄러) -├── statistics/ # 통계 (일간/주간/월간, 잔디 차트) -├── study/ # 학습 세션 (시작/종료, Wi-Fi 검증, 최대 시간 스케줄러) -├── token/ # JWT 토큰 관리 (발급/갱신) -├── user/ # 사용자 관리, 이메일 인증, 프로필 -└── wifi/ # 캠퍼스 Wi-Fi 검증 (CIDR, 캐싱) +├── global/ # 공통: AOP(@AssignUserId), BaseEntity, config, exception, jwt, oauth, response, scheduler +├── board/ # 게시판 CRUD +├── fcm/ # FCM 푸시 알림 +├── image/ # Cloudinary 이미지 업로드 +├── rank/ # 개인/학과/시즌 랭킹 +├── statistics/ # 일간/주간/월간 통계, 잔디 차트 +├── study/ # 학습 세션 (시작/종료, Wi-Fi 검증) +├── token/ # JWT 토큰 관리 +├── user/ # 사용자 관리, 이메일 인증 +└── wifi/ # 캠퍼스 Wi-Fi CIDR 검증 ``` -각 도메인 모듈은 `api/ → controller/ → service/ → repository/ → domain/ → dto/` 계층 구조를 따른다. 
+각 도메인: `api/ → controller/ → service/ → repository/ → domain/ → dto/` -### 아키텍처 패턴 +### 핵심 패턴 -#### Layered Architecture -`Controller → Service → Repository → Entity`. Controller는 HTTP 처리만 담당하고, Service에 비즈니스 로직을 집중한다. +**Layered Architecture**: Controller(HTTP만) → Service(비즈니스 로직) → Repository → Entity -#### AOP 사용자 컨텍스트 주입 +**AOP 사용자 주입**: ```java @PreAuthorize("isAuthenticated() and hasRole('USER')") @AssignUserId // JWT에서 userId 자동 주입 public ResponseEntity endpoint(Long userId) { ... } ``` -#### 표준 응답 형식 -```java -ResponseUtil.createSuccessResponse(data); -ResponseUtil.createFailureResponse(ExceptionType.ERROR_TYPE); -``` +**표준 응답**: `ResponseUtil.createSuccessResponse(data)` / `ResponseUtil.createFailureResponse(ExceptionType.XXX)` -#### 예외 처리 -- `GlobalExceptionHandler`(`@RestControllerAdvice`)에서 전역 처리 -- `ExceptionType` enum으로 에러 코드/메시지/HTTP 상태 관리 -- 도메인 예외는 `BusinessException` 상속 +**예외 처리**: `GlobalExceptionHandler`(@RestControllerAdvice) + `ExceptionType` enum(코드/메시지/HTTP상태) + `BusinessException` 상속 -#### Soft Delete -모든 엔티티가 `BaseEntity` 상속 → `createdAt`, `updatedAt`, `deletedAt` 자동 관리. `User`는 `@SQLDelete`로 삭제 시 데이터 마스킹 처리. +**Soft Delete**: `BaseEntity`(createdAt, updatedAt, deletedAt) 상속. User는 `@SQLDelete`로 마스킹 -#### 이중 랭킹 구조 -- **실시간 랭킹**: `StudySessionRepository` Native Query로 현재 기간 직접 계산 (진행중 세션 포함) -- **확정 랭킹**: 기간 종료 후 `UserRanking`/`DepartmentRanking` 테이블에 저장 -- 컨트롤러에서 `date` 파라미터 유무로 분기 +**이중 랭킹**: +- 실시간: `StudySessionRepository` Native Query로 직접 계산 (진행중 세션 포함) +- 확정: 기간 종료 후 `UserRanking`/`DepartmentRanking` 테이블 저장 +- `date` 파라미터 유무로 분기 -#### 시즌 시스템 -4개 시즌 순환: `SPRING_SEMESTER`(3~6월) → `SUMMER_VACATION`(7~8월) → `FALL_SEMESTER`(9~12월) → `WINTER_VACATION`(1~2월). 시즌 종료 시 `SeasonRankingSnapshot`으로 불변 이력 저장 (Spring Retry 3회, 5초 backoff, JDBC 배치 2000건 청크). +**시즌**: 4개 순환 (`SPRING_SEMESTER` 3~6 → `SUMMER_VACATION` 7~8 → `FALL_SEMESTER` 9~12 → `WINTER_VACATION` 1~2). 종료 시 `SeasonRankingSnapshot` 불변 저장 (Retry 3회, JDBC 배치 2000건) -#### 학과 랭킹 계산 -학과별 상위 30명의 공부 시간 합산. Native Query + CTE로 25개 학과 처리. MySQL 8+ 필수. +**학과 랭킹**: 상위 30명 합산. Native Query + CTE. MySQL 8+ 필수 -### 인증 플로우 +**인증 플로우**: OAuth2 → GUEST 생성 → @kumoh.ac.kr 이메일 인증 → 학과/학번 → USER 승격 → JWT(Access+Refresh, 14일) +역할 계층: `ADMIN` ⊃ `USER` ⊃ `GUEST` -``` -OAuth2 로그인 (Kakao/Google/Apple) - → GUEST 역할로 User 생성 - → 학교 이메일 인증 (@kumoh.ac.kr) - → 학과/학번 입력 → USER 역할로 승격 - → JWT 발급 (Access + Refresh, 14일) - → API 요청 시 Authorization: Bearer 헤더 - → JwtAuthenticationFilter → @PreAuthorize → @AssignUserId -``` +### 에러 코드 접두사 +`C`(공통) · `S`(보안) · `T`(토큰) · `U`(사용자) · `M`(메일) · `ST`(학습) · `W`(WiFi) · `I`(이미지) · `B`(게시판) · `SE`(시즌) · `F`(FCM) -역할 계층: `ADMIN` ⊃ `USER` ⊃ `GUEST` +### 스케줄러 +- 일간 랭킹 `5 0 0 * * *` / 주간 `0 1 0 ? * MON` / 월간 `0 2 0 1 * ?` +- 시즌 전환 `0 5 0 * * *` — 종료 시 스냅샷 + 다음 시즌 시작 +- 최대 공부시간 `0 */10 * * * *` — 3시간 초과 세션 자동 종료 + FCM -### 프로파일 설정 +--- + +## 3. 빌드 & 테스트 + +```bash +docker-compose up -d # MySQL 8.4(3311) + Redis(6379) +./gradlew clean build # 빌드 +./gradlew bootRun --args='--spring.profiles.active=local' # 로컬 실행 +./gradlew test # 전체 테스트 +./gradlew test --tests "ClassName" # 단일 클래스 +``` | Profile | DB | DDL | 용도 | |---------|-----|-----|------| @@ -251,66 +92,35 @@ OAuth2 로그인 (Kakao/Google/Apple) | `test` | TestContainers MySQL 8.0 | - | 통합 테스트 | | `unit-test` | H2 | - | 단위 테스트 | -### 민감 설정 (Git Submodule) -`src/main/resources/security/` 디렉토리에 git submodule로 관리. 
**절대 직접 커밋 금지.** -- `application-database.yml`, `application-security.yml`, `application-mail.yml` -- `application-swagger.yml`, `application-wifi.yml`, `application-cloudinary.yml` - -### CI/CD - -**GitHub Actions:** -- **CI** (dev/prod): Java 21 빌드, TestContainers + Redis 서비스 연동, 테스트, JUnit 리포트 -- **CD** (dev): Docker 멀티 아키텍처 빌드 (AMD64/ARM64) → GHCR 푸시 → self-hosted 배포 -- **CD** (prod): 아티팩트 기반 Docker 빌드 → GHCR → production 환경 배포 - -**Docker:** -- Base image: `amd64/openjdk:21-jdk-slim` -- `docker-compose.yml`: MySQL 8.4.0 (포트 3311) + Redis Alpine (포트 6379) +**단위 테스트** (`src/test/.../unit/`): JUnit 5 + Mockito + AssertJ, `BaseUnitTest`, 프로파일 `unit-test` +**통합 테스트** (`src/test/.../integration/`): TestContainers, `BaseIntegrationTest`, 프로파일 `test`, 테스트 후 TRUNCATE + FLUSHALL --- -## Build & Run - -```bash -# 인프라 실행 (MySQL 8.4, Redis) -docker-compose up -d - -# 빌드 -./gradlew clean build +## 4. 도메인 컨텍스트 -# 로컬 실행 -./gradlew bootRun --args='--spring.profiles.active=local' - -# 전체 테스트 -./gradlew test - -# 단일 테스트 클래스 -./gradlew test --tests "ClassName" -``` - -## 테스트 - -### 단위 테스트 (`src/test/java/.../unit/`) -- JUnit 5 + Mockito + AssertJ, `BaseUnitTest` 기반 -- 프로파일: `unit-test` (H2, Redis 비활성) -- 대상: StudySession 시간 계산, 랭킹 로직, Wi-Fi 검증, 시즌 서비스, 시즌 스냅샷 재시도 +| 도메인 | 역할 | 핵심 엔티티 | +|--------|------|-------------| +| `study` | 학습 세션 시작/종료, 시간 계산 | `StudySession` | +| `rank` | 개인/학과/시즌 랭킹 | `UserRanking`, `DepartmentRanking`, `Season`, `SeasonRankingSnapshot` | +| `statistics` | 일/주/월 통계, 잔디 차트 | 쿼리 기반 | +| `user` | 사용자, 이메일 인증, 프로필 | `User` | +| `token` | JWT 발급/갱신 | `RefreshToken` | +| `board` | 공지사항 | `Board` | +| `wifi` | 캠퍼스 Wi-Fi 검증 | 설정 기반 | +| `fcm` | 푸시 알림 | `User.fcmToken` | -### 통합 테스트 (`src/test/java/.../integration/`) -- TestContainers MySQL 8.0 + Redis 7.0, `BaseIntegrationTest` 기반 -- 프로파일: `test` -- 각 테스트 후 전체 테이블 TRUNCATE + Redis FLUSHALL -- 대상: Controller E2E 테스트 +핵심 가치: **장소 인증**(캠퍼스 Wi-Fi 필수) · **서버 시간 관리** · **경쟁/동기부여**(랭킹) · **대학 인증**(@kumoh.ac.kr) --- -## Development Checklist +## 5. 코딩 컨벤션 1. 도메인 모듈 구조 준수: `api/ → controller/ → service/ → repository/ → domain/ → dto/` -2. 인증 필요 엔드포인트에 `@AssignUserId` + `@PreAuthorize` 사용 +2. 인증 엔드포인트에 `@AssignUserId` + `@PreAuthorize` 사용 3. `@Transactional` 적절히 적용 (읽기 전용은 `readOnly = true`) 4. `ResponseUtil`로 응답 표준화 5. 새 에러는 `ExceptionType` enum에 추가 (접두사 규칙 준수) -6. `security/` 디렉토리 파일 커밋 금지 -7. `StudySessionRepository` Native Query 수정 시 랭킹/통계 도메인 영향 확인 -8. 시간은 반드시 서버에서 생성 — 클라이언트 타임스탬프 파라미터 추가 금지 -9. 시즌/캐시 관련 코드 수정 시 `activeSeason` 캐시 eviction 로직 확인 + +### 민감 설정 (Git Submodule) +`src/main/resources/security/`: `application-{database,security,mail,swagger,wifi,cloudinary}.yml` diff --git a/src/main/java/com/gpt/geumpumtabackend/fcm/CLAUDE.md b/src/main/java/com/gpt/geumpumtabackend/fcm/CLAUDE.md new file mode 100644 index 0000000..04dbc51 --- /dev/null +++ b/src/main/java/com/gpt/geumpumtabackend/fcm/CLAUDE.md @@ -0,0 +1,116 @@ +# FCM Domain + +Firebase Cloud Messaging 기반 푸시 알림 토큰 관리 및 알림 발송을 담당하는 도메인. + +--- + +## 1. 절대 규칙 + +- **FCM 알림 실패가 핵심 비즈니스 로직을 중단시키면 안 됨** — 알림 발송은 항상 try-catch로 감싸고 예외를 로깅만 할 것 +- **토큰 중복 금지** — 하나의 FCM 토큰은 하나의 사용자에게만 바인딩. 등록 시 기존 보유자에서 제거 후 할당 +- **Firebase 서비스 계정 JSON은 git submodule(`security/`)로 관리** — 직접 커밋 금지 +- **`PermanentFcmException`에 해당하는 에러는 재시도하지 않음** — UNREGISTERED, INVALID_ARGUMENT, SENDER_ID_MISMATCH, THIRD_PARTY_AUTH_ERROR + +--- + +## 2. 
아키텍처 + +### 파일 구조 +``` +fcm/ +├── api/ +│ └── FcmApi.java # Swagger 문서 +├── controller/ +│ └── FcmController.java # POST /token (등록), DELETE /token (삭제) +├── dto/ +│ ├── FcmMessageDto.java # 메시지 DTO (token, title, body, imageUrl, data) +│ └── request/ +│ └── FcmTokenRequest.java # 토큰 등록 요청 (@NotBlank fcmToken) +├── exception/ +│ └── PermanentFcmException.java # 재시도 불가 Firebase 에러 래퍼 +└── service/ + ├── FcmService.java # 토큰 관리 + 알림 오케스트레이션 + └── FcmMessageSender.java # Firebase 메시지 발송 (@Retryable) + +# 관련 설정 +global/config/fcm/ +├── FcmConfig.java # FirebaseApp 초기화 (@Profile("!test")) +└── FcmProperties.java # firebase.serviceAccountPath, firebase.projectId +``` + +### 토큰 저장 +- `User.fcmToken` (VARCHAR 255, nullable) — 별도 엔티티 없이 User에 직접 저장 +- 1 User : 1 Token (단일 디바이스 바인딩) + +### 토큰 생명주기 +| 이벤트 | 동작 | +|--------|------| +| `POST /api/v1/fcm/token` | 기존 보유자에서 제거 → 새 사용자에 할당 | +| `DELETE /api/v1/fcm/token` | 토큰 null 처리 | +| 로그아웃 (`UserService.logout`) | `fcmService.removeFcmToken()` 호출 | +| 회원 탈퇴 (`UserService.withdrawUser`) | `fcmService.removeFcmToken()` + `@SQLDelete`로 fcm_token=NULL | +| Firebase `UNREGISTERED` 응답 | 로그 경고 (자동 삭제는 미구현) | + +### 재시도 전략 (`FcmMessageSender.send`) +``` +@Retryable(maxAttempts = 3, backoff = @Backoff(delay = 1000, multiplier = 2)) +``` +- **재시도 대상**: UNAVAILABLE, INTERNAL, QUOTA_EXCEEDED +- **재시도 불가** (`PermanentFcmException`): UNREGISTERED, INVALID_ARGUMENT, SENDER_ID_MISMATCH, THIRD_PARTY_AUTH_ERROR +- **복구**: 3회 실패 시 `@Recover` → `BusinessException(FCM_SEND_FAILED)` throw + +### 알림 트리거 (현재 1건) + +**최대 집중 시간 초과 알림**: +1. `MaxFocusStudyScheduler` (`@Scheduled(fixedRate = 1000)`) — 3시간 초과 세션 감지 +2. `StudySessionService.endExpiredMaxFocusSessions()` — 세션 강제 종료 +3. `FcmService.sendMaxFocusNotification(user, hours)` — 푸시 발송 + - title: "최대 집중 시간 도달" + - body: "{hours}시간 동안 열심히 공부하셨습니다! 잠시 휴식을 취해보세요." + - data: `{ type: "STUDY_SESSION_FORCE_ENDED", maxFocusHours: "{hours}" }` + - 토큰 없으면 silent return, 예외 발생해도 catch 후 로깅만 + +--- + +## 3. 빌드 & 테스트 + +### 테스트 설정 +- `TestFcmConfig` (`@Profile("test")`) — 더미 credentials로 mock FirebaseApp 생성 +- `FcmConfig`는 `@Profile("!test")`로 테스트 시 비활성화 +- FCM 전용 단위/통합 테스트 파일은 현재 없음 — study 도메인 통합 테스트에서 간접 검증 + +--- + +## 4. 도메인 컨텍스트 + +### 타 도메인 의존 관계 +``` +study (MaxFocusStudyScheduler) → FcmService.sendMaxFocusNotification() +user (UserService.logout/withdraw) → FcmService.removeFcmToken() +user (UserRepository.findByFcmToken) → 토큰 중복 체크 +``` + +### 에러 코드 +| 코드 | 이름 | HTTP | 설명 | +|------|------|------|------| +| F001 | FCM_SEND_FAILED | 500 | 푸시 알림 전송 실패 (3회 재시도 후) | +| F002 | FCM_INVALID_TOKEN | 400 | 빈/유효하지 않은 FCM 토큰 | +| F003 | FCM_TOKEN_NOT_FOUND | 404 | 등록된 FCM 토큰 없음 | + +### API 엔드포인트 (`/api/v1/fcm`) +| Method | Path | 설명 | 권한 | +|--------|------|------|------| +| POST | `/token` | FCM 디바이스 토큰 등록 | USER | +| DELETE | `/token` | FCM 토큰 삭제 | USER | + +모든 엔드포인트: `@PreAuthorize(USER)` + `@AssignUserId` + +--- + +## 5. 코딩 컨벤션 + +1. 새 알림 타입 추가 시 `FcmService`에 전용 메서드 생성 (예: `sendMaxFocusNotification` 패턴) +2. 알림 data 필드의 `type` 키로 클라이언트 측 분기 — 새 타입 추가 시 클라이언트 팀과 협의 +3. 재시도 불가 에러 추가 시 `FcmMessageSender.send()`의 `isPermanentError()` 분기에 추가 +4. 응답은 `ResponseUtil.createSuccessResponse()` 표준 형식 +5. 
알림 발송은 핵심 로직과 분리 — 발송 실패가 트랜잭션을 롤백시키지 않도록 설계 diff --git a/src/main/java/com/gpt/geumpumtabackend/fcm/exception/PermanentFcmException.java b/src/main/java/com/gpt/geumpumtabackend/fcm/exception/PermanentFcmException.java new file mode 100644 index 0000000..655a50e --- /dev/null +++ b/src/main/java/com/gpt/geumpumtabackend/fcm/exception/PermanentFcmException.java @@ -0,0 +1,9 @@ +package com.gpt.geumpumtabackend.fcm.exception; + +import com.google.firebase.messaging.FirebaseMessagingException; + +public class PermanentFcmException extends RuntimeException { + public PermanentFcmException(FirebaseMessagingException cause) { + super(cause); + } +} diff --git a/src/main/java/com/gpt/geumpumtabackend/rank/CLAUDE.md b/src/main/java/com/gpt/geumpumtabackend/rank/CLAUDE.md index b04ea8c..c2ba29f 100644 --- a/src/main/java/com/gpt/geumpumtabackend/rank/CLAUDE.md +++ b/src/main/java/com/gpt/geumpumtabackend/rank/CLAUDE.md @@ -1,128 +1,141 @@ -# Rank Domain CLAUDE.md +# Rank Domain -## 개요 +개인/학과/시즌 랭킹 계산·저장·조회를 담당하는 도메인. -개인/학과 랭킹 및 시즌 시스템을 담당하는 도메인. 실시간 랭킹 계산, 확정 랭킹 저장, 시즌 전환, 스냅샷 생성을 포함한다. +--- -## 파일 구조 +## 1. 절대 규칙 +- **`StudySessionRepository` Native Query 수정 시 랭킹/통계 도메인 영향 반드시 확인** — 실시간 랭킹이 이 쿼리에 직접 의존 +- **`activeSeason` 캐시 eviction은 시즌 전환 전에 실행** — 순서 뒤바뀌면 stale 캐시로 잘못된 시즌 참조 +- **스냅샷은 생성 후 절대 수정 금지** — `SeasonRankingSnapshot`은 불변 이력 레코드 +- **`DepartmentRankingRepository` CTE 쿼리는 MySQL 8+ 전용** — H2에서 동작하지 않음 +- **`SeasonSnapshotBatchService`는 JPA가 아닌 JDBC 직접 사용** — 트랜잭션 범위가 JPA와 분리됨 + +--- + +## 2. 아키텍처 + +### 파일 구조 ``` rank/ -├── api/ -│ ├── PersonalRankApi.java # 개인 랭킹 Swagger 문서 -│ ├── DepartmentRankApi.java # 학과 랭킹 Swagger 문서 -│ └── SeasonRankApi.java # 시즌 랭킹 Swagger 문서 -├── controller/ -│ ├── PersonalRankController.java # /api/v1/rank/personal/* -│ ├── DepartmentRankController.java # /api/v1/rank/department/* -│ └── SeasonRankController.java # /api/v1/rank/season/* +├── api/ # Swagger 문서 (PersonalRankApi, DepartmentRankApi, SeasonRankApi) +├── controller/ # PersonalRank, DepartmentRank, SeasonRank 컨트롤러 ├── domain/ -│ ├── UserRanking.java # 개인 랭킹 엔티티 -│ ├── DepartmentRanking.java # 학과 랭킹 엔티티 -│ ├── Season.java # 시즌 엔티티 (기간 검증 포함) -│ ├── SeasonRankingSnapshot.java # 시즌 종료 시 확정 랭킹 스냅샷 -│ ├── RankType.java # enum: OVERALL, DEPARTMENT -│ ├── RankingType.java # enum: DAILY, WEEKLY, MONTHLY -│ ├── SeasonType.java # enum: SPRING_SEMESTER, SUMMER_VACATION, FALL_SEMESTER, WINTER_VACATION -│ └── SeasonStatus.java # enum: ACTIVE, ENDED +│ ├── UserRanking # 확정 개인 랭킹 (rank, totalMillis, rankingType, calculatedAt) +│ ├── DepartmentRanking # 확정 학과 랭킹 (department, rank, totalMillis, rankingType, calculatedAt) +│ ├── Season # 시즌 (name, seasonType, startDate, endDate, status) +│ ├── SeasonRankingSnapshot # 시즌 종료 시 불변 스냅샷 (seasonId, userId, rankType, finalRank, finalTotalMillis) +│ └── enums: RankType(OVERALL|DEPARTMENT), RankingType(DAILY|WEEKLY|MONTHLY), +│ SeasonType(4시즌), SeasonStatus(ACTIVE|ENDED) ├── dto/ -│ ├── PersonalRankingTemp.java # JPQL 프로젝션용 DTO (userId, nickname, department, totalMillis, ranking) -│ ├── DepartmentRankingTemp.java # 학과 집계용 DTO -│ └── response/ -│ ├── PersonalRankingResponse.java # topRanks + myRanking -│ ├── PersonalRankingEntryResponse.java # 개인 랭킹 항목 -│ ├── DepartmentRankingResponse.java # topRanks + myDepartmentRanking -│ ├── DepartmentRankingEntryResponse.java # 학과 랭킹 항목 -│ └── SeasonRankingResponse.java # 시즌 랭킹 (seasonId, seasonName, dates, rankings) +│ ├── PersonalRankingTemp # JPQL 프로젝션 DTO (Department enum/String 양쪽 생성자) +│ ├── DepartmentRankingTemp # 학과 집계 DTO +│ └── 
response/ # PersonalRankingResponse, DepartmentRankingResponse, +│ SeasonRankingResponse, SeasonDepartmentRankingResponse ├── repository/ -│ ├── UserRankingRepository.java # 개인 랭킹 JPQL 쿼리 -│ ├── DepartmentRankingRepository.java # 학과 랭킹 Native Query (CTE 사용) -│ ├── SeasonRepository.java # 시즌 조회 (날짜 범위) -│ └── SeasonRankingSnapshotRepository.java # 스냅샷 조회/존재 확인 +│ ├── UserRankingRepository # JPQL — 확정 랭킹 + 시즌 월간/일간 합산 쿼리 +│ ├── DepartmentRankingRepository # Native CTE — 25개 학과 랭킹 (MySQL 8+) +│ ├── SeasonRepository # Native — 날짜 범위로 시즌 조회 +│ └── SeasonRankingSnapshotRepository # 스냅샷 조회/존재 확인/학과 집계 ├── service/ -│ ├── PersonalRankService.java # 개인 랭킹 조회 (실시간/확정) -│ ├── DepartmentRankService.java # 학과 랭킹 조회 -│ ├── SeasonRankService.java # 시즌 랭킹 계산 (월간+일간+실시간 병합) -│ ├── SeasonService.java # 시즌 생명주기 관리 (@Cacheable) -│ ├── SeasonSnapshotService.java # 스냅샷 생성 (@Retryable, 3회, 5초 backoff) -│ └── SeasonSnapshotBatchService.java # JDBC 배치 인서트 (2000건 청크) +│ ├── PersonalRankService # 개인 랭킹 조회 (실시간/확정) +│ ├── DepartmentRankService # 학과 랭킹 조회 +│ ├── SeasonRankService # 시즌 랭킹 (월간+일간+실시간 병합, mergeAndRank) +│ ├── SeasonService # 시즌 생명주기 (@Cacheable activeSeason) +│ ├── SeasonSnapshotService # 스냅샷 생성 (@Retryable 3회, 5초 backoff) +│ └── SeasonSnapshotBatchService # JDBC 배치 인서트 (2000건 청크) └── scheduler/ - ├── RankingSchedulerService.java # 일간/주간/월간 랭킹 스케줄러 - └── SeasonTransitionScheduler.java # 시즌 전환 스케줄러 + ├── RankingSchedulerService # 일간/주간/월간 랭킹 확정 + └── SeasonTransitionScheduler # 시즌 전환 + 스냅샷 생성 ``` -## 핵심 개념 - ### 이중 랭킹 구조 -- **실시간 랭킹**: `StudySessionRepository`에서 직접 계산 (현재 기간) -- **확정 랭킹**: 기간 종료 후 `UserRanking`/`DepartmentRanking`에 저장 (과거 기간) -컨트롤러에서 `date` 파라미터 유무로 분기: -- `date` 없음 → 현재 기간 실시간 랭킹 -- `date` 있음 → 해당 날짜의 확정 랭킹 +| 구분 | 데이터 소스 | 트리거 | +|------|------------|--------| +| 실시간 | `StudySessionRepository` Native Query (진행중 세션 포함) | `date` 파라미터 없을 때 | +| 확정 | `UserRanking` / `DepartmentRanking` 테이블 | `date` 파라미터 있을 때 | -### 시즌 시스템 -4개 시즌이 순환: -| SeasonType | 기간 | -|---|---| -| SPRING_SEMESTER | 3/1 ~ 6/30 | -| SUMMER_VACATION | 7/1 ~ 8/31 | -| FALL_SEMESTER | 9/1 ~ 12/31 | -| WINTER_VACATION | 1/1 ~ 2/28(29) | +### 시즌 랭킹 계산 (3단 병합) +1. **확정 월간 합산** — seasonStart ~ (currentMonth - 1)의 월간 랭킹 SUM +2. **현재 월 일간 합산** — 1일 ~ (today - 1)의 일간 랭킹 SUM +3. **오늘 실시간** — `StudySessionRepository`에서 직접 계산 +4. **병합** — userId/department별 GROUP BY → totalMillis SUM → RANK() (동점 처리) + +### 동점 처리 (MySQL RANK 시맨틱) +``` +Millis: 1000, 1000, 800 → Rank: 1, 1, 3 (2 건너뜀) +``` + +### 학과 랭킹 +- 학과별 상위 30명 공부시간 합산 → 전체 학과 간 RANK +- CTE UNION ALL로 25개 학과 전부 포함 (0시간 학과도) +- 응답에서는 0시간 학과를 `topRanks`에서 제외하되, **본인 학과는 0이어도 항상 표시** + +### Fallback +랭킹에 없는 사용자/학과: `rank = listSize + 1`, `totalMillis = 0` + +--- + +## 3. 빌드 & 테스트 -시즌 랭킹 = 확정 월간 합산 + 현재 월 일간 합산 + 오늘 실시간 데이터를 `mergeAndRank()`로 병합. +### 스케줄러 실행 순서 (겹침 방지) +| Cron | 작업 | +|------|------| +| `5 0 0 * * *` | 일간 랭킹 확정 (매일 00:00:05) | +| `0 1 0 ? 
* MON` | 주간 랭킹 확정 (월요일 00:01) | +| `0 2 0 1 * ?` | 월간 랭킹 확정 (매월 1일 00:02) | +| `0 5 0 * * *` | 시즌 전환 확인 (매일 00:05) | -### 학과 랭킹 계산 -- 학과별 상위 30명의 공부 시간을 합산 -- Native Query + CTE로 25개 학과 처리 -- 공부 시간 0인 학과는 topRanks에서 제외하되, 본인 학과는 0이어도 표시 +### 단위 테스트 (`unit/rank/service/`) +- `PersonalRankServiceTest` — 실시간/확정 랭킹, fallback, 빈 리스트 +- `DepartmentRankServiceTest` — 0시간 필터링, 본인 학과 포함, 학과명 변환 +- `SeasonServiceTest` — 시즌 생성/전환, 4시즌 순환, 윤년 처리 +- `SeasonSnapshotServiceRetryTest` — 재시도 3회, 중복 방지(idempotency) -### Fallback 로직 -랭킹에 포함되지 않은 사용자: `rank = listSize + 1`, `totalMillis = 0` +### 통합 테스트 (`integration/rank/controller/`) +- `DepartmentRankControllerIntegrationTest` — E2E, 인증, 데이터 격리 +- `SeasonRankControllerIntegrationTest` — 시즌 CRUD, 스냅샷 집계, 에러 케이스 -## 스케줄러 실행 시점 +--- -| 작업 | Cron | 설명 | +## 4. 도메인 컨텍스트 + +### 시즌 시스템 + +| SeasonType | 기간 | 비고 | +|------------|------|------| +| `SPRING_SEMESTER` | 3/1 ~ 6/30 | | +| `SUMMER_VACATION` | 7/1 ~ 8/31 | | +| `FALL_SEMESTER` | 9/1 ~ 12/31 | | +| `WINTER_VACATION` | 1/1 ~ 2/28(29) | 윤년 처리 | + +**생명주기**: ACTIVE 시즌 1개만 존재 → 스케줄러가 endDate+1 감지 → 캐시 evict → 스냅샷 생성(Retry 3회) → 현재 시즌 ENDED → 다음 시즌 ACTIVE + +### 에러 코드 +| 코드 | 이름 | 설명 | |------|------|------| -| 일간 랭킹 계산 | `5 0 0 * * *` | 매일 00:00:05 | -| 주간 랭킹 계산 | `0 1 0 ? * MON` | 매주 월요일 00:01 | -| 월간 랭킹 계산 | `0 2 0 1 * ?` | 매월 1일 00:02 | -| 시즌 전환 확인 | `0 5 0 * * *` | 매일 00:05 | - -## API 엔드포인트 - -### 개인 랭킹 (`/api/v1/rank/personal`) -- `GET /daily?date=` — 일간 개인 랭킹 -- `GET /weekly?date=` — 주간 개인 랭킹 (월요일 기준) -- `GET /monthly?date=` — 월간 개인 랭킹 (1일 기준) - -### 학과 랭킹 (`/api/v1/rank/department`) -- `GET /daily?date=` — 일간 학과 랭킹 -- `GET /weekly?date=` — 주간 학과 랭킹 -- `GET /monthly?date=` — 월간 학과 랭킹 - -### 시즌 랭킹 (`/api/v1/rank/season`) -- `GET /current` — 현재 시즌 전체 랭킹 -- `GET /current/department?department=` — 현재 시즌 학과별 랭킹 -- `GET /{seasonId}` — 종료된 시즌 전체 랭킹 -- `GET /{seasonId}/department?department=` — 종료된 시즌 학과별 랭킹 - -## 테스트 - -### Unit Tests -- `PersonalRankServiceTest` — 실시간/확정 랭킹, fallback, 동점 처리, 빈 리스트 -- `DepartmentRankServiceTest` — 0시간 학과 필터링, 본인 학과 포함, 학과명 변환 -- `SeasonRankServiceTest` — 데이터 병합, 동점 처리, 스냅샷 조회, 예외(SEASON_NOT_FOUND, SEASON_NOT_ENDED) -- `SeasonServiceTest` — 시즌 생성/전환, 4개 시즌 순환, 윤년 처리, 날짜 검증 -- `SeasonSnapshotServiceRetryTest` — 재시도 메커니즘, 중복 방지 - -### Integration Tests -- `DepartmentRankControllerIntegrationTest` — E2E API 테스트, 인증, 데이터 격리 - -## 개발 시 주의사항 - -1. 랭킹 쿼리가 복잡하므로 `StudySessionRepository`의 Native Query도 함께 확인할 것 -2. 시즌 전환 시 `activeSeason` 캐시가 evict됨 — 캐시 관련 코드 수정 시 주의 -3. `SeasonSnapshotBatchService`는 JDBC 직접 사용 — JPA와 별도 트랜잭션 -4. `DepartmentRankingRepository`의 Native Query는 CTE 사용 — MySQL 8+ 필수 -5. `PersonalRankingTemp`에 Department enum/String 두 가지 생성자 존재 — JPQL 프로젝션 방식에 따라 다름 +| SE001 | SEASON_NOT_FOUND | 시즌 미발견 | +| SE002 | SEASON_NOT_ENDED | ACTIVE 시즌을 종료 시즌으로 조회 시도 | +| SE003 | SEASON_INVALID_DATE_RANGE | endDate ≤ startDate | +| SE004 | SEASON_ALREADY_ENDED | 이미 종료된 시즌 재종료 시도 | +| SE005 | NO_ACTIVE_SEASON | 활성 시즌 없음 | + +### API 엔드포인트 + +**개인 랭킹** (`/api/v1/rank/personal`): `GET /daily`, `/weekly`, `/monthly` — 모두 `?date=` 선택적 +**학과 랭킹** (`/api/v1/rank/department`): 동일 구조 +**시즌 랭킹** (`/api/v1/rank/season`): `GET /current`, `/current/department?department=`, `/{seasonId}`, `/{seasonId}/department?department=` + +모든 엔드포인트: `@PreAuthorize(USER)` + `@AssignUserId` + +--- + +## 5. 코딩 컨벤션 + +1. `PersonalRankingTemp`에 Department enum/String 두 생성자 존재 — JPQL 프로젝션 방식에 따라 선택 +2. 응답은 `ResponseUtil.createSuccessResponse(data)` 표준 형식 +3. 읽기 전용 서비스 메서드에 `@Transactional(readOnly = true)` +4. 
새 에러 코드는 `SE` 접두사 + `ExceptionType` enum에 추가 +5. 랭킹 계산 로직 수정 시 동점 처리(RANK 시맨틱) 유지 확인 diff --git a/src/main/java/com/gpt/geumpumtabackend/rank/dto/response/SeasonDepartmentRankingResponse.java b/src/main/java/com/gpt/geumpumtabackend/rank/dto/response/SeasonDepartmentRankingResponse.java index 030575e..9ab8d23 100644 --- a/src/main/java/com/gpt/geumpumtabackend/rank/dto/response/SeasonDepartmentRankingResponse.java +++ b/src/main/java/com/gpt/geumpumtabackend/rank/dto/response/SeasonDepartmentRankingResponse.java @@ -1,11 +1,9 @@ package com.gpt.geumpumtabackend.rank.dto.response; import com.gpt.geumpumtabackend.rank.domain.Season; -import com.gpt.geumpumtabackend.rank.dto.DepartmentRankingTemp; import java.time.LocalDate; import java.util.List; -import java.util.stream.Collectors; public record SeasonDepartmentRankingResponse( From 3d4610f22c4cf559d4edb48ceb96fed07deec64a Mon Sep 17 00:00:00 2001 From: Juhye0k Date: Fri, 3 Apr 2026 16:45:35 +0900 Subject: [PATCH 3/3] =?UTF-8?q?chore=20:=20=EB=B6=88=ED=95=84=EC=9A=94?= =?UTF-8?q?=ED=95=9C=20import=20=EC=82=AD=EC=A0=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/gpt/geumpumtabackend/board/dto/BoardListResponse.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/com/gpt/geumpumtabackend/board/dto/BoardListResponse.java b/src/main/java/com/gpt/geumpumtabackend/board/dto/BoardListResponse.java index 79ed3ee..be10999 100644 --- a/src/main/java/com/gpt/geumpumtabackend/board/dto/BoardListResponse.java +++ b/src/main/java/com/gpt/geumpumtabackend/board/dto/BoardListResponse.java @@ -3,7 +3,6 @@ import com.gpt.geumpumtabackend.board.domain.Board; import java.time.LocalDateTime; -import java.util.List; public record BoardListResponse( Long id,
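
The rank CLAUDE.md above describes the season ranking as a three-source merge (finalized monthly sums, current-month daily sums, today's realtime figures) followed by a per-key GROUP BY and a RANK() pass with competition-style tie handling (1000, 1000, 800 gives ranks 1, 1, 3), plus a fallback of rank = listSize + 1 and totalMillis = 0 for a user or department missing from the list. A minimal Java sketch of those two rules, assuming a hypothetical RankedEntry projection rather than the actual mergeAndRank() signature in SeasonRankService:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class RankMergeSketch {

    // Hypothetical projection: key is a userId or a department name.
    record RankedEntry(String key, long totalMillis, int rank) {}

    // Sum millis per key across the three sources, then assign competition ranks:
    // equal totals share a rank and the next rank is skipped (1000, 1000, 800 -> 1, 1, 3).
    static List<RankedEntry> mergeAndRank(List<Map<String, Long>> sources) {
        Map<String, Long> merged = sources.stream()
                .flatMap(source -> source.entrySet().stream())
                .collect(Collectors.groupingBy(Map.Entry::getKey,
                        Collectors.summingLong(Map.Entry::getValue)));

        List<Map.Entry<String, Long>> sorted = merged.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue(Comparator.reverseOrder()))
                .toList();

        List<RankedEntry> ranked = new ArrayList<>();
        int rank = 0;
        long previous = Long.MIN_VALUE;
        for (int i = 0; i < sorted.size(); i++) {
            long millis = sorted.get(i).getValue();
            if (millis != previous) {
                rank = i + 1;   // RANK() semantics: position in the ordered list, not dense rank
                previous = millis;
            }
            ranked.add(new RankedEntry(sorted.get(i).getKey(), millis, rank));
        }
        return ranked;
    }

    // Fallback described in the doc: an entry absent from the ranking list.
    static RankedEntry fallback(String key, int listSize) {
        return new RankedEntry(key, 0L, listSize + 1);
    }
}
```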
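
SeasonSnapshotBatchService is documented as writing snapshots through plain JDBC in 2,000-row chunks, with a transaction scope separate from JPA. A sketch of that pattern with Spring's JdbcTemplate; the table and column names below are assumptions for illustration and are derived only from the snapshot fields listed in the doc (seasonId, userId, rankType, finalRank, finalTotalMillis), not from the project's actual schema:

```java
import java.util.List;
import org.springframework.jdbc.core.JdbcTemplate;

public class SnapshotBatchInsertSketch {

    private static final int CHUNK_SIZE = 2000;

    // Assumed table/column names for the sketch only.
    private static final String INSERT_SQL = """
            INSERT INTO season_ranking_snapshot
                (season_id, user_id, rank_type, final_rank, final_total_millis)
            VALUES (?, ?, ?, ?, ?)
            """;

    record SnapshotRow(long seasonId, long userId, String rankType, int finalRank, long finalTotalMillis) {}

    // Insert rows in fixed-size chunks so a single statement batch stays bounded.
    static void insertInChunks(JdbcTemplate jdbcTemplate, List<SnapshotRow> rows) {
        for (int from = 0; from < rows.size(); from += CHUNK_SIZE) {
            List<SnapshotRow> chunk = rows.subList(from, Math.min(from + CHUNK_SIZE, rows.size()));
            jdbcTemplate.batchUpdate(INSERT_SQL, chunk, chunk.size(),
                    (ps, row) -> {
                        ps.setLong(1, row.seasonId());
                        ps.setLong(2, row.userId());
                        ps.setString(3, row.rankType());
                        ps.setInt(4, row.finalRank());
                        ps.setLong(5, row.finalTotalMillis());
                    });
        }
    }
}
```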
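
The scheduler table lists deliberately staggered crons (00:00:05 daily, 00:01 weekly, 00:02 monthly, 00:05 season check) so the jobs do not overlap. In Spring's @Scheduled form this would look roughly as follows; the class and method names are placeholders, not the actual RankingSchedulerService / SeasonTransitionScheduler code:

```java
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

// Placeholder component showing the staggered cron expressions from the table above.
@Component
public class RankingScheduleSketch {

    @Scheduled(cron = "5 0 0 * * *")    // daily ranking finalization at 00:00:05
    public void finalizeDailyRanking() { }

    @Scheduled(cron = "0 1 0 ? * MON")  // weekly ranking finalization, Monday 00:01
    public void finalizeWeeklyRanking() { }

    @Scheduled(cron = "0 2 0 1 * ?")    // monthly ranking finalization, 1st of the month at 00:02
    public void finalizeMonthlyRanking() { }

    @Scheduled(cron = "0 5 0 * * *")    // season transition check at 00:05
    public void checkSeasonTransition() { }
}
```

Scheduling like this only fires if @EnableScheduling is present on a configuration class.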
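
The WINTER_VACATION row calls out explicit leap-year handling for its 2/28(29) end date. With java.time this reduces to asking for the last day of February, as in this minimal sketch (method name is hypothetical):

```java
import java.time.LocalDate;
import java.time.Month;
import java.time.YearMonth;

public class SeasonDateSketch {

    // End of WINTER_VACATION: 2/28 in normal years, 2/29 in leap years.
    static LocalDate winterVacationEnd(int year) {
        return YearMonth.of(year, Month.FEBRUARY).atEndOfMonth();
    }
}
```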