From 262cd044b921e3c2fa1c49fae21238d4eb1877a8 Mon Sep 17 00:00:00 2001 From: twq110 Date: Mon, 6 Apr 2026 14:55:32 +0900 Subject: [PATCH] =?UTF-8?q?[AI]=20[FEAT]=20=EA=B3=84=EC=A0=95=20ID=20?= =?UTF-8?q?=EC=83=9D=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../compare_transformer_feature_schemas.py | 498 ++++++++++++++++++ ...mer_refactor_change_analysis_2026-04-03.md | 119 +++++ .../run_config.json | 23 + AI/config/trading.default.json | 1 + AI/config/trading.py | 4 + AI/libs/database/repository.py | 138 ++++- AI/pipelines/daily_routine.py | 5 +- schema.sql | 58 +- 8 files changed, 820 insertions(+), 26 deletions(-) create mode 100644 AI/backtests/compare_transformer_feature_schemas.py create mode 100644 AI/backtests/out/transformer_refactor_change_analysis_2026-04-03.md create mode 100644 AI/backtests/out/transformer_schema_compare_smoke/run_config.json diff --git a/AI/backtests/compare_transformer_feature_schemas.py b/AI/backtests/compare_transformer_feature_schemas.py new file mode 100644 index 00000000..3edc5563 --- /dev/null +++ b/AI/backtests/compare_transformer_feature_schemas.py @@ -0,0 +1,498 @@ +from __future__ import annotations + +import argparse +import json +import os +import pickle +import random +import sys +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +import tensorflow as tf +from sklearn.model_selection import train_test_split +from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau + + +PROJECT_ROOT = Path(__file__).resolve().parents[2] +if str(PROJECT_ROOT) not in sys.path: + sys.path.append(str(PROJECT_ROOT)) + +from AI.modules.signal.core.data_loader import DataLoader +from AI.modules.signal.models.transformer.architecture import build_transformer_model + + +PRE307_DYNAMIC_23 = [ + "log_return", + "open_ratio", + "high_ratio", + "low_ratio", + "vol_change", + "ma_5_ratio", + "ma_20_ratio", + "ma_60_ratio", + "rsi", + "macd_ratio", + "bb_position", + "us10y", + "yield_spread", + "vix_close", + "dxy_close", + "credit_spread_hy", + "nh_nl_index", + "ma200_pct", + "sentiment_score", + "risk_keyword_cnt", + "per", + "pbr", + "roe", +] + +CURRENT_FIXED_17 = [ + "log_return", + "open_ratio", + "high_ratio", + "low_ratio", + "vol_change", + "ma5_ratio", + "ma20_ratio", + "ma60_ratio", + "rsi", + "macd_ratio", + "bb_position", + "week_ma20_ratio", + "week_rsi", + "week_bb_pos", + "week_vol_change", + "month_ma12_ratio", + "month_rsi", +] + + +@dataclass +class TrainArtifacts: + schema_name: str + requested_features: list[str] + effective_features: list[str] + n_features: int + n_samples: int + epochs_ran: int + history_csv: str + summary_json: str + model_path: str + scaler_path: str + best_val_loss: float | None + best_epoch: int | None + final_val_loss: float | None + + +def _set_global_seed(seed: int) -> None: + random.seed(seed) + np.random.seed(seed) + tf.keras.utils.set_random_seed(seed) + try: + tf.config.experimental.enable_op_determinism() + except Exception: + pass + + +def _safe_float(value: Any) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _parse_horizons(raw_horizons: str) -> list[int]: + horizons = [int(item.strip()) for item in raw_horizons.split(",") if item.strip()] + if not horizons: + raise ValueError("prediction horizons must not be empty.") + return horizons + + +def _prepare_output_dir(output_dir: str | None) -> Path: + if output_dir: + target = Path(output_dir).resolve() + else: + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + target = (PROJECT_ROOT / "AI" / "backtests" / "out" / f"transformer_schema_compare_{stamp}").resolve() + target.mkdir(parents=True, exist_ok=True) + return target + + +def _fit_single_schema( + *, + schema_name: str, + requested_features: list[str], + case_output_dir: Path, + db_name: str, + lookback: int, + horizons: list[int], + data_start_date: str, + train_end_date: str, + test_size: float, + split_seed: int, + global_seed: int, + epochs: int, + batch_size: int, + patience: int, + lr_patience: int, + lr_factor: float, + min_lr: float, + verbose: int, +) -> TrainArtifacts: + _set_global_seed(global_seed) + tf.keras.backend.clear_session() + + loader = DataLoader(db_name=db_name, lookback=lookback, horizons=horizons) + full_df = loader.load_data_from_db(start_date=data_start_date) + raw_df = full_df[full_df["date"] <= train_end_date].copy() + if raw_df.empty: + raise ValueError(f"[{schema_name}] no rows found in selected training range.") + + X_ts, X_ticker, X_sector, y_class, _, info = loader.create_dataset(raw_df, feature_columns=requested_features) + if len(y_class) == 0: + raise ValueError(f"[{schema_name}] dataset is empty after preprocessing.") + + ( + X_ts_train, + X_ts_val, + X_tick_train, + X_tick_val, + X_sec_train, + X_sec_val, + y_train, + y_val, + ) = train_test_split( + X_ts, + X_ticker, + X_sector, + y_class, + test_size=test_size, + shuffle=True, + random_state=split_seed, + ) + + model = build_transformer_model( + input_shape=(X_ts.shape[1], X_ts.shape[2]), + n_tickers=info["n_tickers"], + n_sectors=info["n_sectors"], + n_outputs=len(info.get("horizons", horizons)), + ) + + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), + loss="binary_crossentropy", + metrics=["accuracy"], + ) + + case_output_dir.mkdir(parents=True, exist_ok=True) + model_path = case_output_dir / "multi_horizon_model.keras" + scaler_path = case_output_dir / "multi_horizon_scaler.pkl" + + callbacks = [ + ModelCheckpoint( + filepath=str(model_path), + monitor="val_loss", + save_best_only=True, + verbose=1 if verbose > 0 else 0, + ), + EarlyStopping( + monitor="val_loss", + patience=patience, + restore_best_weights=True, + ), + ReduceLROnPlateau( + monitor="val_loss", + factor=lr_factor, + patience=lr_patience, + min_lr=min_lr, + verbose=1 if verbose > 0 else 0, + ), + ] + + history = model.fit( + x=[X_ts_train, X_tick_train, X_sec_train], + y=y_train, + validation_data=([X_ts_val, X_tick_val, X_sec_val], y_val), + epochs=epochs, + batch_size=batch_size, + shuffle=True, + callbacks=callbacks, + verbose=verbose, + ) + + with scaler_path.open("wb") as handle: + pickle.dump(info["scaler"], handle) + + history_df = pd.DataFrame(history.history) + history_df.insert(0, "epoch", np.arange(1, len(history_df) + 1)) + history_csv = case_output_dir / "history.csv" + history_df.to_csv(history_csv, index=False) + + val_loss_series = history_df["val_loss"] if "val_loss" in history_df.columns else pd.Series(dtype="float64") + if val_loss_series.empty: + best_val_loss = None + best_epoch = None + final_val_loss = None + else: + min_idx = int(val_loss_series.idxmin()) + best_val_loss = _safe_float(val_loss_series.iloc[min_idx]) + best_epoch = int(history_df.loc[min_idx, "epoch"]) + final_val_loss = _safe_float(val_loss_series.iloc[-1]) + + summary = { + "schema_name": schema_name, + "requested_features": requested_features, + "effective_features": info.get("feature_names", []), + "n_features": int(info.get("n_features", 0)), + "n_samples": int(len(y_class)), + "lookback": int(lookback), + "horizons": [int(h) for h in info.get("horizons", horizons)], + "train_rows": int(len(raw_df)), + "train_date_min": str(raw_df["date"].min()), + "train_date_max": str(raw_df["date"].max()), + "epochs_ran": int(len(history_df)), + "best_val_loss": best_val_loss, + "best_epoch": best_epoch, + "final_val_loss": final_val_loss, + "history_csv": str(history_csv), + "model_path": str(model_path), + "scaler_path": str(scaler_path), + } + + summary_json = case_output_dir / "summary.json" + summary_json.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8") + + return TrainArtifacts( + schema_name=schema_name, + requested_features=list(requested_features), + effective_features=list(info.get("feature_names", [])), + n_features=int(info.get("n_features", 0)), + n_samples=int(len(y_class)), + epochs_ran=int(len(history_df)), + history_csv=str(history_csv), + summary_json=str(summary_json), + model_path=str(model_path), + scaler_path=str(scaler_path), + best_val_loss=best_val_loss, + best_epoch=best_epoch, + final_val_loss=final_val_loss, + ) + + +def _save_curve_plot( + *, + output_dir: Path, + dynamic_df: pd.DataFrame, + fixed_df: pd.DataFrame, +) -> str | None: + try: + import matplotlib.pyplot as plt + except Exception: + return None + + if "val_loss" not in dynamic_df.columns or "val_loss" not in fixed_df.columns: + return None + + figure_path = output_dir / "val_loss_curve.png" + plt.figure(figsize=(10, 6)) + plt.plot(dynamic_df["epoch"], dynamic_df["val_loss"], label="dynamic23") + plt.plot(fixed_df["epoch"], fixed_df["val_loss"], label="fixed17") + plt.xlabel("Epoch") + plt.ylabel("Validation Loss") + plt.title("Transformer Val Loss Curve: dynamic23 vs fixed17") + plt.grid(alpha=0.3) + plt.legend() + plt.tight_layout() + plt.savefig(figure_path, dpi=140) + plt.close() + return str(figure_path) + + +def _write_comparison_report( + *, + output_dir: Path, + dynamic_artifacts: TrainArtifacts, + fixed_artifacts: TrainArtifacts, + curve_csv: Path, + curve_plot: str | None, +) -> Path: + dynamic_best = dynamic_artifacts.best_val_loss + fixed_best = fixed_artifacts.best_val_loss + if dynamic_best is None or fixed_best is None: + best_gap = None + else: + best_gap = fixed_best - dynamic_best + + comparison = { + "dynamic23": asdict(dynamic_artifacts), + "fixed17": asdict(fixed_artifacts), + "best_val_loss_gap_fixed17_minus_dynamic23": best_gap, + "val_curve_csv": str(curve_csv), + "val_curve_plot": curve_plot, + } + comparison_json = output_dir / "comparison_summary.json" + comparison_json.write_text(json.dumps(comparison, indent=2, ensure_ascii=False), encoding="utf-8") + + lines = [ + "# Transformer Feature Schema Retrain Comparison", + "", + f"- dynamic23 best val_loss: {dynamic_artifacts.best_val_loss}", + f"- fixed17 best val_loss: {fixed_artifacts.best_val_loss}", + f"- best val_loss gap (fixed17 - dynamic23): {best_gap}", + f"- dynamic23 epochs ran: {dynamic_artifacts.epochs_ran}", + f"- fixed17 epochs ran: {fixed_artifacts.epochs_ran}", + f"- dynamic23 best epoch: {dynamic_artifacts.best_epoch}", + f"- fixed17 best epoch: {fixed_artifacts.best_epoch}", + "", + "## Artifacts", + f"- curve csv: {curve_csv}", + f"- curve plot: {curve_plot if curve_plot else 'not generated (matplotlib unavailable or val_loss missing)'}", + f"- dynamic23 summary: {dynamic_artifacts.summary_json}", + f"- fixed17 summary: {fixed_artifacts.summary_json}", + f"- comparison json: {comparison_json}", + ] + report_path = output_dir / "comparison_report.md" + report_path.write_text("\n".join(lines), encoding="utf-8") + return report_path + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Retrain transformer with pre-#307 dynamic23 vs current fixed17 and compare val curves.", + ) + parser.add_argument("--db-name", default="db", help="Database connection profile name.") + parser.add_argument("--lookback", type=int, default=60, help="Sequence length.") + parser.add_argument("--horizons", default="1,3,5,7", help="Prediction horizons, comma separated.") + parser.add_argument("--data-start-date", default="2015-01-01", help="Data fetch start date (YYYY-MM-DD).") + parser.add_argument("--train-end-date", default="2023-12-31", help="Training cutoff date (YYYY-MM-DD).") + parser.add_argument("--epochs", type=int, default=50, help="Max epochs.") + parser.add_argument("--batch-size", type=int, default=32, help="Batch size.") + parser.add_argument("--test-size", type=float, default=0.2, help="Validation split ratio.") + parser.add_argument("--split-seed", type=int, default=42, help="Seed for train/val split.") + parser.add_argument("--global-seed", type=int, default=42, help="Global random seed.") + parser.add_argument("--patience", type=int, default=10, help="EarlyStopping patience.") + parser.add_argument("--lr-patience", type=int, default=5, help="ReduceLROnPlateau patience.") + parser.add_argument("--lr-factor", type=float, default=0.5, help="ReduceLROnPlateau factor.") + parser.add_argument("--min-lr", type=float, default=1e-6, help="Minimum learning rate.") + parser.add_argument("--verbose", type=int, default=2, choices=[0, 1, 2], help="Keras fit verbosity.") + parser.add_argument( + "--output-dir", + default=None, + help="Optional output directory. Default: AI/backtests/out/transformer_schema_compare_", + ) + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + + output_dir = _prepare_output_dir(args.output_dir) + horizons = _parse_horizons(args.horizons) + + run_config = { + "db_name": args.db_name, + "lookback": args.lookback, + "horizons": horizons, + "data_start_date": args.data_start_date, + "train_end_date": args.train_end_date, + "epochs": args.epochs, + "batch_size": args.batch_size, + "test_size": args.test_size, + "split_seed": args.split_seed, + "global_seed": args.global_seed, + "patience": args.patience, + "lr_patience": args.lr_patience, + "lr_factor": args.lr_factor, + "min_lr": args.min_lr, + "verbose": args.verbose, + "output_dir": str(output_dir), + } + (output_dir / "run_config.json").write_text(json.dumps(run_config, indent=2, ensure_ascii=False), encoding="utf-8") + + print(f"[SchemaCompare] Output directory: {output_dir}") + print("[SchemaCompare] Training case 1/2: dynamic23 (pre-#307 schema)") + dynamic_artifacts = _fit_single_schema( + schema_name="dynamic23", + requested_features=PRE307_DYNAMIC_23, + case_output_dir=output_dir / "dynamic23", + db_name=args.db_name, + lookback=args.lookback, + horizons=horizons, + data_start_date=args.data_start_date, + train_end_date=args.train_end_date, + test_size=args.test_size, + split_seed=args.split_seed, + global_seed=args.global_seed, + epochs=args.epochs, + batch_size=args.batch_size, + patience=args.patience, + lr_patience=args.lr_patience, + lr_factor=args.lr_factor, + min_lr=args.min_lr, + verbose=args.verbose, + ) + + print("[SchemaCompare] Training case 2/2: fixed17 (current schema)") + fixed_artifacts = _fit_single_schema( + schema_name="fixed17", + requested_features=CURRENT_FIXED_17, + case_output_dir=output_dir / "fixed17", + db_name=args.db_name, + lookback=args.lookback, + horizons=horizons, + data_start_date=args.data_start_date, + train_end_date=args.train_end_date, + test_size=args.test_size, + split_seed=args.split_seed, + global_seed=args.global_seed, + epochs=args.epochs, + batch_size=args.batch_size, + patience=args.patience, + lr_patience=args.lr_patience, + lr_factor=args.lr_factor, + min_lr=args.min_lr, + verbose=args.verbose, + ) + + dynamic_df = pd.read_csv(dynamic_artifacts.history_csv) + fixed_df = pd.read_csv(fixed_artifacts.history_csv) + + curve_df = pd.DataFrame({"epoch": np.arange(1, max(len(dynamic_df), len(fixed_df)) + 1)}) + curve_df = curve_df.merge( + dynamic_df[["epoch", "val_loss"]].rename(columns={"val_loss": "dynamic23_val_loss"}), + on="epoch", + how="left", + ) + curve_df = curve_df.merge( + fixed_df[["epoch", "val_loss"]].rename(columns={"val_loss": "fixed17_val_loss"}), + on="epoch", + how="left", + ) + curve_csv = output_dir / "val_curve_comparison.csv" + curve_df.to_csv(curve_csv, index=False) + + curve_plot = _save_curve_plot(output_dir=output_dir, dynamic_df=dynamic_df, fixed_df=fixed_df) + report_path = _write_comparison_report( + output_dir=output_dir, + dynamic_artifacts=dynamic_artifacts, + fixed_artifacts=fixed_artifacts, + curve_csv=curve_csv, + curve_plot=curve_plot, + ) + + print("[SchemaCompare] Completed.") + print(f"[SchemaCompare] Comparison report: {report_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/AI/backtests/out/transformer_refactor_change_analysis_2026-04-03.md b/AI/backtests/out/transformer_refactor_change_analysis_2026-04-03.md new file mode 100644 index 00000000..a55e7e81 --- /dev/null +++ b/AI/backtests/out/transformer_refactor_change_analysis_2026-04-03.md @@ -0,0 +1,119 @@ +# Transformer 변경점 심층 분석 (기준: 2026-04-03) + +## 1) 분석 범위 +- 비교 기준: `20260314 #307 daily routine 코드 리펙토링` (merge commit `5d752d2`) 전후 + 이후 관련 커밋 +- 확인 파일: + - `AI/modules/signal/models/transformer/train.py` + - `AI/modules/signal/core/data_loader.py` + - `AI/modules/signal/models/transformer/wrapper.py` + - `AI/pipelines/components/model_manager.py` + - `AI/modules/signal/core/artifact_paths.py` + - `AI/modules/finder/screener.py` + +## 2) 핵심 타임라인 (Transformer 관점) +- `6fd3b39` (`#307` 브랜치 내부): + - `train.py` import 경로 변경 + - `PatchTST.architecture` -> `transformer.architecture` + - 기본 저장 경로를 `.../transformer/tests/`로 변경 +- `c9ed165` (`#307` 브랜치 내부): + - `TRANSFORMER_TRAIN_FEATURES` 17개 고정 도입 + - `create_dataset(..., feature_columns=TRANSFORMER_TRAIN_FEATURES)` 도입 + - `DataLoader`에서 `add_multi_timeframe_features()` 실제 호출 시작 +- `867999e` (`#307` 브랜치 내부): + - wrapper에 입력 길이 검증 추가 (`len(df) < seq_len` 방어) +- `5d752d2` (main에 squash merge): 위 3개 변화가 사실상 반영 +- `96a4640`: + - config 구조화(`trading.py`, `trading.default.json`) +- `ae48e58`: + - legacy h5 체크포인트의 input shape 추론 로직 추가 +- `92bc3b3`: + - artifact path 해석 로직(`artifact_paths.py`) 도입 + - `AI_MODEL_WEIGHTS_DIR` 환경변수 기반 아티팩트 루트 전환 가능 + +## 3) 피처 스키마 변화 (가장 큰 변화) + +### 이전(동적 후보, `#307` 직전 `5d752d2^`) +`data_loader.py`의 `potential_features` 23개: +- `log_return, open_ratio, high_ratio, low_ratio, vol_change` +- `ma_5_ratio, ma_20_ratio, ma_60_ratio` +- `rsi, macd_ratio, bb_position` +- `us10y, yield_spread, vix_close, dxy_close, credit_spread_hy` +- `nh_nl_index, ma200_pct` +- `sentiment_score, risk_keyword_cnt` +- `per, pbr, roe` + +### 현재(고정 17, `train.py`) +- `log_return, open_ratio, high_ratio, low_ratio, vol_change` +- `ma5_ratio, ma20_ratio, ma60_ratio` +- `rsi, macd_ratio, bb_position` +- `week_ma20_ratio, week_rsi, week_bb_pos, week_vol_change` +- `month_ma12_ratio, month_rsi` + +### Set 비교 +- 공통 8개: + - `log_return, open_ratio, high_ratio, low_ratio, vol_change, rsi, macd_ratio, bb_position` +- 이전 대비 제거 15개: + - `ma_5_ratio, ma_20_ratio, ma_60_ratio` + - `us10y, yield_spread, vix_close, dxy_close, credit_spread_hy` + - `nh_nl_index, ma200_pct` + - `sentiment_score, risk_keyword_cnt` + - `per, pbr, roe` +- 현재에서 신규 9개: + - `ma5_ratio, ma20_ratio, ma60_ratio` + - `week_ma20_ratio, week_rsi, week_bb_pos, week_vol_change` + - `month_ma12_ratio, month_rsi` + +## 4) 중요한 정합성 포인트 + +### (A) MA 피처 네이밍 불일치 이슈의 구조 +- `add_technical_indicators()`는 `ma{window}_ratio` (예: `ma5_ratio`) 생성 +- 과거 동적 후보에는 `ma_5_ratio` 형태가 들어가 있어, 동적 경로에서는 MA 3개가 빠질 여지가 있었음 +- `#307` 이후 고정 17로 `ma5_ratio`를 직접 지정하면서 이 불일치가 실질 해소됨 + +### (B) 멀티타임프레임 피처의 “정의는 있었지만 사용은 안 됨” -> “사용 시작” +- `add_multi_timeframe_features()` 함수 자체는 이전에도 존재 +- 다만 `DataLoader.create_dataset()`에서 실제 호출된 것은 `#307` 라인에서 시작 +- 즉, 주봉/월봉 6개 피처는 `#307` 이후 학습 입력에 본격 반영 + +### (C) 아티팩트 경로/로드 방식 변화 +- `#307` 이후 + 후속 커밋에서 아래가 바뀜: + - 테스트/프로덕션 파일명 분기 (`*_test`, `*_prod`) + - 환경변수 기반 weights 루트 전환 + - legacy `.keras`(실제 h5) 로드 폴백 강화 +- 따라서 “같은 코드라도 어떤 체크포인트를 읽었는지”가 결과에 크게 영향 가능 + +## 5) 현재 실제 스케일러 상태 확인 (로컬 파일 기반) + +확인 파일: +- `AI/data/weights/transformer/tests/multi_horizon_scaler_test.pkl` +- `AI/data/weights/transformer/prod/multi_horizon_scaler_prod.pkl` + +공통 결과: +- `n_features_in_ = 17` +- `feature_names_in_`가 현재 고정 17과 동일 + +관찰 포인트: +- `vol_change`, `week_vol_change`의 max가 매우 큼 (긴 꼬리) + - test: `vol_change max ~15420`, `week_vol_change max ~25361` + - prod: `vol_change max ~15420`, `week_vol_change max ~120508` +- MinMaxScaler 사용 시 극단치가 있으면 대부분 샘플이 좁은 구간으로 압축될 수 있음 + +## 6) “val_loss=0.690 정체”에 대한 코드 관점 해석 +- BCE에서 `0.690~0.693` 고정은 보통 확률 0.5 근처(랜덤 수준) 신호 +- 이번 코드 이력에서 그럴 만한 강한 변화는 아래 3가지: + 1. 입력 스키마가 사실상 재정의됨(23 동적 혼합 -> 17 고정 기술/멀티타임프레임) + 2. MA 네이밍 정합성 정리로 실제 입력 컬럼 조합이 바뀜 + 3. 체크포인트/스케일러 경로 분기 변경으로 다른 아티팩트를 읽었을 가능성 + +즉, “단순 리팩토링”이 아니라 **학습 입력 분포와 로딩 경로가 동시에 바뀐 리팩토링**에 가까움. + +## 7) 추가 확인 시도 결과 +- DB 기반 재현 실행은 현재 환경에서 실패: + - `localhost:15432` PostgreSQL 연결 거부 +- 따라서 샘플 수/실제 선택 컬럼/라벨 분포를 런타임으로 재현하지는 못했고, 코드/커밋/스케일러 파일 단서 중심으로 분석함. + +## 8) 스크리너 확장 관련 별도 관찰 (현재 코드) +- `AI/config/trading.py`에 확장 필드(`include_tickers`, `exclude_tickers`, `sticky_slots`, 가중치 등)는 존재 +- `AI/modules/finder/screener.py`의 쿼리에는 아직 미반영 +- 따라서 “GEV 같은 특정 티커 보존/우선” 요구는 현재 config만으로는 완전 제어 불가 + diff --git a/AI/backtests/out/transformer_schema_compare_smoke/run_config.json b/AI/backtests/out/transformer_schema_compare_smoke/run_config.json new file mode 100644 index 00000000..d9a0fa5d --- /dev/null +++ b/AI/backtests/out/transformer_schema_compare_smoke/run_config.json @@ -0,0 +1,23 @@ +{ + "db_name": "db", + "lookback": 60, + "horizons": [ + 1, + 3, + 5, + 7 + ], + "data_start_date": "2015-01-01", + "train_end_date": "2023-12-31", + "epochs": 1, + "batch_size": 32, + "test_size": 0.2, + "split_seed": 42, + "global_seed": 42, + "patience": 10, + "lr_patience": 5, + "lr_factor": 0.5, + "min_lr": 1e-06, + "verbose": 2, + "output_dir": "C:\\Users\\jaebin\\Documents\\GitHub\\sisc-web\\AI\\backtests\\out\\transformer_schema_compare_smoke" +} \ No newline at end of file diff --git a/AI/config/trading.default.json b/AI/config/trading.default.json index ffe77ce2..cff0b134 100644 --- a/AI/config/trading.default.json +++ b/AI/config/trading.default.json @@ -1,6 +1,7 @@ { "pipeline": { "db_name": "db", + "account_code": "chart_based_signal_model", "default_mode": "simulation", "enable_xai": true, "data_start_date": "2023-01-01", diff --git a/AI/config/trading.py b/AI/config/trading.py index a90723ab..16163a6e 100644 --- a/AI/config/trading.py +++ b/AI/config/trading.py @@ -26,6 +26,7 @@ class MacroFallbackConfig: @dataclass(frozen=True, slots=True) class PipelineConfig: db_name: str + account_code: str default_mode: str enable_xai: bool data_start_date: str @@ -162,6 +163,7 @@ def _build_config(raw: dict[str, Any]) -> TradingConfig: config = TradingConfig( pipeline=PipelineConfig( db_name=raw["pipeline"]["db_name"], + account_code=str(raw["pipeline"].get("account_code", "chart_based_signal_model")).strip(), default_mode=raw["pipeline"]["default_mode"], enable_xai=raw["pipeline"]["enable_xai"], data_start_date=raw["pipeline"]["data_start_date"], @@ -212,6 +214,8 @@ def _build_config(raw: dict[str, Any]) -> TradingConfig: ) if config.pipeline.initial_capital <= 0: raise ValueError("pipeline.initial_capital must be greater than 0") + if not config.pipeline.account_code: + raise ValueError("pipeline.account_code must be a non-empty string") return config diff --git a/AI/libs/database/repository.py b/AI/libs/database/repository.py index 6dd31cb3..7c2acfb6 100644 --- a/AI/libs/database/repository.py +++ b/AI/libs/database/repository.py @@ -21,7 +21,7 @@ class PortfolioRepository: DB와의 상호작용(조회 및 저장)을 캡슐화하여 제공합니다. """ - def __init__(self, db_name: str = "db"): + def __init__(self, db_name: str = "db", account_code: Optional[str] = None): """ [초기화 메서드] 객체 생성 시 사용할 데이터베이스의 이름을 설정합니다. @@ -31,6 +31,9 @@ def __init__(self, db_name: str = "db"): 테스트 시에는 "test_db" 등으로 변경하여 주입(의존성 주입)할 수 있습니다. """ self.db_name = db_name + normalized_account_code = (account_code or "").strip() + self.account_code = normalized_account_code if normalized_account_code else None + self._account_id_cache: Optional[int] = None def get_latest_total_asset(self, target_date: str, default_asset: float = 100_000_000) -> float: """ @@ -75,6 +78,57 @@ def _get_connection(self): """ return get_db_conn(self.db_name) + def _resolve_account_id(self, conn=None) -> Optional[int]: + """ + Resolve account_id from account_code and cache it for reuse. + """ + if self.account_code is None: + return None + + if self._account_id_cache is not None: + return self._account_id_cache + + own_connection = conn is None + local_conn = conn or self._get_connection() + if local_conn is None: + raise RuntimeError( + f"[PortfolioRepository][Error] Failed to resolve account_id for account_code={self.account_code!r}: DB connection unavailable." + ) + + try: + with local_conn.cursor() as cursor: + cursor.execute( + """ + SELECT id, is_active + FROM public.account_names + WHERE account_code = %s + ORDER BY id ASC + """, + (self.account_code,), + ) + rows = cursor.fetchall() + finally: + if own_connection and local_conn: + local_conn.close() + + if not rows: + raise ValueError( + f"[PortfolioRepository][Error] account_code={self.account_code!r} not found in public.account_names." + ) + if len(rows) > 1: + raise ValueError( + f"[PortfolioRepository][Error] duplicate rows found for account_code={self.account_code!r}." + ) + + account_id, is_active = rows[0] + if not bool(is_active): + raise ValueError( + f"[PortfolioRepository][Error] account_code={self.account_code!r} is inactive in public.account_names." + ) + + self._account_id_cache = int(account_id) + return self._account_id_cache + def get_current_position(self, ticker: str, target_date: str = None, initial_cash: float = 10000) -> Dict[str, Any]: """ [현재 포지션 조회] @@ -102,6 +156,10 @@ def get_current_position(self, ticker: str, target_date: str = None, initial_cas WHERE ticker = %s """ params = [ticker] + account_id = self._resolve_account_id(conn) + if account_id is not None: + query += " AND account_id = %s " + params.append(account_id) # target_date가 주어지면 그 날짜 '이하'의 체결내역만 필터링하여 미래 데이터를 차단합니다. if target_date: @@ -182,19 +240,29 @@ def get_current_cash(self, target_date: str = None, initial_cash: float = 100000 try: with conn.cursor() as cursor: + account_id = self._resolve_account_id(conn) if target_date: exec_cash_query = """ SELECT cash_after FROM public.executions WHERE fill_date <= %s + """ + exec_cash_params: list[Any] = [target_date] + if account_id is not None: + exec_cash_query += " AND account_id = %s " + exec_cash_params.append(account_id) + exec_cash_query += """ ORDER BY fill_date DESC, created_at DESC, id DESC LIMIT 1 """ - cursor.execute(exec_cash_query, (target_date,)) + cursor.execute(exec_cash_query, tuple(exec_cash_params)) exec_cash = cursor.fetchone() if exec_cash and exec_cash[0] is not None: return float(exec_cash[0]) + if account_id is not None: + return float(initial_cash) + summary_cash_query = """ SELECT cash FROM public.portfolio_summary @@ -224,10 +292,17 @@ def get_open_tickers(self, target_date: str) -> List[str]: if conn is None: return [] + account_id = self._resolve_account_id(conn) query = """ SELECT ticker FROM public.executions WHERE fill_date <= %s + """ + query_params: list[Any] = [target_date] + if account_id is not None: + query += " AND account_id = %s " + query_params.append(account_id) + query += """ GROUP BY ticker HAVING SUM( CASE @@ -241,7 +316,7 @@ def get_open_tickers(self, target_date: str) -> List[str]: try: with conn.cursor() as cursor: - cursor.execute(query, (target_date,)) + cursor.execute(query, tuple(query_params)) rows = cursor.fetchall() return [str(row[0]) for row in rows] except Exception as e: @@ -266,27 +341,46 @@ def reset_run_data(self, run_id: str, target_date: Optional[str] = None) -> None try: with conn.cursor() as cursor: + account_id = self._resolve_account_id(conn) if target_date: # Safety-first default: only clear current run artifacts. # Global chain reset can remove unrelated simulations sharing the same DB. allow_global_chain_reset = os.environ.get("AI_ALLOW_GLOBAL_CHAIN_RESET", "0") == "1" if allow_global_chain_reset: - cursor.execute( - "DELETE FROM public.executions WHERE fill_date >= %s AND run_id LIKE 'daily_%%'", - (target_date,), - ) - cursor.execute( - "DELETE FROM public.xai_reports WHERE date >= %s AND run_id LIKE 'daily_%%'", - (target_date,), - ) - cursor.execute("DELETE FROM public.portfolio_positions WHERE date >= %s", (target_date,)) - cursor.execute("DELETE FROM public.portfolio_summary WHERE date >= %s", (target_date,)) + if account_id is None: + cursor.execute( + "DELETE FROM public.executions WHERE fill_date >= %s AND run_id LIKE 'daily_%%'", + (target_date,), + ) + cursor.execute( + "DELETE FROM public.xai_reports WHERE date >= %s AND run_id LIKE 'daily_%%'", + (target_date,), + ) + cursor.execute("DELETE FROM public.portfolio_positions WHERE date >= %s", (target_date,)) + cursor.execute("DELETE FROM public.portfolio_summary WHERE date >= %s", (target_date,)) + else: + cursor.execute( + "DELETE FROM public.executions WHERE fill_date >= %s AND run_id LIKE 'daily_%%' AND account_id = %s", + (target_date, account_id), + ) else: + if account_id is None: + cursor.execute("DELETE FROM public.executions WHERE run_id = %s", (run_id,)) + cursor.execute("DELETE FROM public.xai_reports WHERE run_id = %s", (run_id,)) + else: + cursor.execute( + "DELETE FROM public.executions WHERE run_id = %s AND account_id = %s", + (run_id, account_id), + ) + else: + if account_id is None: cursor.execute("DELETE FROM public.executions WHERE run_id = %s", (run_id,)) cursor.execute("DELETE FROM public.xai_reports WHERE run_id = %s", (run_id,)) - else: - cursor.execute("DELETE FROM public.executions WHERE run_id = %s", (run_id,)) - cursor.execute("DELETE FROM public.xai_reports WHERE run_id = %s", (run_id,)) + else: + cursor.execute( + "DELETE FROM public.executions WHERE run_id = %s AND account_id = %s", + (run_id, account_id), + ) conn.commit() if target_date: if os.environ.get("AI_ALLOW_GLOBAL_CHAIN_RESET", "0") == "1": @@ -362,6 +456,7 @@ def _normalize_run_id(value: Any) -> Optional[str]: cursor = conn.cursor() try: + resolved_account_id = self._resolve_account_id(conn) run_ids = sorted( { str(run_id).strip() @@ -370,11 +465,17 @@ def _normalize_run_id(value: Any) -> Optional[str]: } ) if run_ids: - cursor.execute("DELETE FROM public.executions WHERE run_id = ANY(%s)", (run_ids,)) + if resolved_account_id is None: + cursor.execute("DELETE FROM public.executions WHERE run_id = ANY(%s)", (run_ids,)) + else: + cursor.execute( + "DELETE FROM public.executions WHERE run_id = ANY(%s) AND account_id = %s", + (run_ids, resolved_account_id), + ) # 다량의 데이터를 빠르게 넣기 위한 INSERT 구문 준비 insert_query = """ INSERT INTO public.executions ( - run_id, xai_report_id, ticker, signal_date, signal_price, signal, + run_id, account_id, xai_report_id, ticker, signal_date, signal_price, signal, fill_date, fill_price, qty, side, value, commission, cash_after, position_qty, avg_price, pnl_realized, pnl_unrealized, created_at @@ -393,6 +494,7 @@ def _normalize_run_id(value: Any) -> Optional[str]: data_to_insert.append(( str(row["run_id"]), + resolved_account_id, xai_id, str(row["ticker"]), row["signal_date"], diff --git a/AI/pipelines/daily_routine.py b/AI/pipelines/daily_routine.py index 91ccd24f..c92f19ca 100644 --- a/AI/pipelines/daily_routine.py +++ b/AI/pipelines/daily_routine.py @@ -84,7 +84,10 @@ def run_daily_pipeline( run_id = f"daily_{exec_date_str}" print(f"\n[{exec_date_str}] === AI Daily Portfolio Routine (Mode: {mode.upper()}) ===") - repo = repo if repo is not None else PortfolioRepository(db_name=trading_config.pipeline.db_name) + repo = repo if repo is not None else PortfolioRepository( + db_name=trading_config.pipeline.db_name, + account_code=trading_config.pipeline.account_code, + ) if mode == "simulation" and hasattr(repo, "reset_run_data"): try: diff --git a/schema.sql b/schema.sql index c133403a..3e39d9b8 100644 --- a/schema.sql +++ b/schema.sql @@ -1,6 +1,9 @@ ---------------------------------------------------------------------- -- Schema: public --- - 퀀트 트레이딩, 백테스트, XAI 분석 및 포트폴리오 관리 시스템의 핵심 데이터 모델 +-- 퀀트 트레이딩, 백테스트, XAI 분석 및 포트폴리오 관리 시스템의 핵심 데이터 모델 +-- AI 스키마 현황입니다. 주요 테이블과 인덱스가 포함되어 있으며, 향후 필요에 따라 추가/수정될 수 있습니다. +-- 구조만 보여줄 뿐, 이 문서를 변경해도 db 자체는 변경되지 않습니다! +-- 스키마 변경시 최신화 바랍니다. ---------------------------------------------------------------------- -- 1. HDD 경로를 테이블스페이스로 등록 @@ -175,13 +178,49 @@ CREATE TABLE "xai_reports" ( ) TABLESPACE ts_ai_hdd; ---------------------------------------------------------------------- --- 9. executions +-- 9. account_names +-- - 파이프라인/계정 식별자 메타데이터 관리 +-- - executions.account_id 외래키 참조 기준 테이블 +---------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS "account_names" ( + "id" bigserial PRIMARY KEY, -- 계정 식별자 PK + "account_code" varchar(128) NOT NULL, -- 파이프라인에서 참조하는 안정 키 + "account_name" varchar(255) NOT NULL, -- 표시용 계정 이름 + "description" text, -- 선택 설명 + "is_active" boolean DEFAULT true NOT NULL, -- 활성 여부 + "created_at" timestamp with time zone DEFAULT now() NOT NULL, -- 생성 시각 + "updated_at" timestamp with time zone DEFAULT now() NOT NULL, -- 수정 시각 + CONSTRAINT "uq_account_names_account_code" UNIQUE ("account_code") +) TABLESPACE ts_ai_hdd; + +-- 기본 legacy 파이프라인 계정을 idempotent하게 등록 +INSERT INTO "account_names" ( + "account_code", + "account_name", + "description", + "is_active" +) VALUES ( + 'chart_based_signal_model', + '차트 기반 시그널 모델', + 'Chart-based signal model account.', + true +) +ON CONFLICT ("account_code") DO UPDATE +SET + "account_name" = EXCLUDED."account_name", + "description" = EXCLUDED."description", + "is_active" = true, + "updated_at" = now(); + +---------------------------------------------------------------------- +-- 10. executions -- - 실제 거래 또는 백테스트 시뮬레이션에서 발생한 개별 체결 이력 -- - 자산 추적 및 성과 평가를 위한 가장 세밀한 로그 데이터 ---------------------------------------------------------------------- CREATE TABLE "executions" ( "id" bigserial PRIMARY KEY, -- 체결 로그 ID "run_id" varchar(64), -- 백테스트/실행 회차 ID + "account_id" bigint, -- 체결 주체 계정 ID (account_names.id) "ticker" varchar(255) NOT NULL, -- 종목 티커 "signal_date" date NOT NULL, -- 전략 신호 발생일 "signal_price" numeric(38, 2), -- 전략 신호 당시 가격 @@ -199,11 +238,12 @@ CREATE TABLE "executions" ( "pnl_unrealized" numeric(38, 2) NOT NULL, -- 현재 시점 기준 미실현 손익 "created_at" timestamp with time zone DEFAULT now() NOT NULL, -- DB 기록 시각 "xai_report_id" bigint, -- 연관된 XAI 리포트 참조 ID + CONSTRAINT "fk_executions_account_names" FOREIGN KEY ("account_id") REFERENCES "account_names"("id") ON DELETE RESTRICT, CONSTRAINT "fk_executions_xai_reports" FOREIGN KEY ("xai_report_id") REFERENCES "xai_reports"("id") ON DELETE SET NULL ) TABLESPACE ts_ai_hdd; ---------------------------------------------------------------------- --- 10. portfolio_summary +-- 11. portfolio_summary -- - 전체 자산의 일별 성과 요약 (Equity Curve 생성용) ---------------------------------------------------------------------- CREATE TABLE "portfolio_summary" ( @@ -219,7 +259,7 @@ CREATE TABLE "portfolio_summary" ( ) TABLESPACE ts_ai_hdd; ---------------------------------------------------------------------- --- 11. portfolio_positions +-- 12. portfolio_positions -- 날짜별 종목 포지션 스냅샷을 저장 -- - portfolio_summary(일자별 총자산)와 함께 일별 상태 재현/검증 가능 ---------------------------------------------------------------------- @@ -237,7 +277,7 @@ CREATE TABLE "portfolio_positions" ( ) TABLESPACE ts_ai_hdd; ---------------------------------------------------------------------- --- 12. event_calendar +-- 13. event_calendar -- - 주요 경제 일정(FOMC, CPI, GDP) 및 기업 실적 발표일 저장 -- - AI 모델의 'Event' 피처(D-Day 계산 등)를 위한 원천 데이터 ---------------------------------------------------------------------- @@ -257,7 +297,7 @@ CREATE TABLE IF NOT EXISTS "event_calendar" ( ) TABLESPACE ts_ai_hdd; ---------------------------------------------------------------------- --- 13. sector_returns +-- 14. sector_returns -- - stock_info의 'sector'와 매핑되는 ETF의 일별 수익률 저장 -- - Wide Format이 아닌 Long Format (Date, Sector) 구조 ---------------------------------------------------------------------- @@ -280,6 +320,10 @@ CREATE INDEX IF NOT EXISTS "idx_crypto_price_data_ticker" ON "crypto_price_data" -- 2. executions: 특정 백테스트 회차의 체결 내역 모아보기 CREATE INDEX IF NOT EXISTS "idx_executions_run_id" ON "executions" ("run_id") TABLESPACE ts_ai_hdd; +-- 2-1. executions: 계정 단위 분리 조회/기간 조회 +CREATE INDEX IF NOT EXISTS "idx_executions_account_id" ON "executions" ("account_id") TABLESPACE ts_ai_hdd; +CREATE INDEX IF NOT EXISTS "idx_executions_account_id_fill_date" ON "executions" ("account_id", "fill_date") TABLESPACE ts_ai_hdd; + -- 3. executions: XAI 리포트 조인용 (외래키는 자동 인덱스 생성이 안 됨) CREATE INDEX IF NOT EXISTS "idx_executions_xai_report_id" ON "executions" ("xai_report_id") TABLESPACE ts_ai_hdd; @@ -293,4 +337,4 @@ CREATE INDEX IF NOT EXISTS "idx_portfolio_positions_date_ticker" ON "portfolio_p CREATE INDEX IF NOT EXISTS "idx_xai_reports_run_id" ON "xai_reports" ("run_id") TABLESPACE ts_ai_hdd; -- 7. company_fundamentals: 종목별 재무제표 데이터 조회용 (날짜 기준) -CREATE INDEX IF NOT EXISTS "idx_company_fundamentals_date" ON "company_fundamentals" ("date") TABLESPACE ts_ai_hdd; \ No newline at end of file +CREATE INDEX IF NOT EXISTS "idx_company_fundamentals_date" ON "company_fundamentals" ("date") TABLESPACE ts_ai_hdd;