apache · Appointat · Dec 29, 2025 · Dec 29, 2025 · Dec 30, 2025 · Jan 4, 2026
diff --git a/geaflow-ai/src/operator/casts/.gitignore b/geaflow-ai/src/operator/casts/.gitignore
@@ -0,0 +1,22 @@
+# Byte-compiled / optimized files
+__pycache__/
+*.py[cod]
+
+# Environment variables
+.env
+
+# Virtual environment and uv lock file
+.venv/
+uv.lock
+
+# Logs
+/logs/
+
+# IDE / OS specific
+.vscode/
+.DS_Store
+
+# Data files, traversal-path images, and Markdown documents
+data/real_graph_data/
+casts_traversal_path_req_*.png
+*.md
diff --git a/geaflow-ai/src/operator/casts/casts/__init__.py b/geaflow-ai/src/operator/casts/casts/__init__.py
diff --git a/geaflow-ai/src/operator/casts/casts/core/__init__.py b/geaflow-ai/src/operator/casts/casts/core/__init__.py
diff --git a/geaflow-ai/src/operator/casts/casts/core/config.py b/geaflow-ai/src/operator/casts/casts/core/config.py
@@ -0,0 +1,210 @@
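+"""Configuration management for the CASTS system.
+
+Provides an abstraction over configuration sources so that call sites do not
+hard-code values: service endpoints, API keys and model names come from
+environment variables (optionally loaded from a ``.env`` file), while the
+remaining parameters are defined once as class-level defaults.
+"""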
+
+import os
+from typing import Any, Literal
+
+from dotenv import load_dotenv
+
+from casts.core.interfaces import Configuration
+
+# Load environment variables from .env file
+load_dotenv()
+
+
+class DefaultConfiguration(Configuration):
+    """Default CASTS configuration exposed as class attributes.
+
+    Embedding and LLM service settings are read from environment variables when
+    this class body is executed; the module calls ``load_dotenv()`` first, so a
+    ``.env`` file may supply them. Every other setting is a fixed default defined
+    below, which keeps configuration centralized in one place.
+    """
+
+    # Alias key -> canonical attribute name, for legacy key names used in the codebase.
+    _KEY_ALIASES = {
+        "EMBEDDING_MODEL_NAME": "EMBEDDING_MODEL",
+        "LLM_MODEL_NAME": "LLM_MODEL",
+    }
+
+    # Stripped, lower-cased string values that get_bool() interprets as False.
+    _FALSE_STRINGS = frozenset({"", "0", "false", "no", "n", "off"})
+
+    # ============================================
+    # EMBEDDING SERVICE CONFIGURATION
+    # ============================================
+    # Read from the environment; the fallbacks are an empty endpoint and a
+    # placeholder key (not a usable credential).
+    EMBEDDING_ENDPOINT = os.environ.get("EMBEDDING_ENDPOINT", "")
+    EMBEDDING_APIKEY = os.environ.get("EMBEDDING_APIKEY", "YOUR_EMBEDDING_API_KEY_HERE")
+    # Default to a known embedding model to avoid requiring call-site defaults.
+    EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-v3")
+
+    # ============================================
+    # LLM SERVICE CONFIGURATION
+    # ============================================
+    # Read from the environment; unlike the embedding model, the LLM model has
+    # no built-in default and falls back to an empty string.
+    LLM_ENDPOINT = os.environ.get("LLM_ENDPOINT", "")
+    LLM_APIKEY = os.environ.get("LLM_APIKEY", "YOUR_LLM_API_KEY_HERE")
+    LLM_MODEL = os.environ.get("LLM_MODEL", "")
+
+    # ============================================
+    # SIMULATION CONFIGURATION
+    # ============================================
+    SIMULATION_GRAPH_SIZE = 40  # For synthetic data: the number of nodes in the generated graph.
+    SIMULATION_NUM_EPOCHS = 5  # Number of simulation epochs to run.
+    SIMULATION_MAX_DEPTH = 5  # Max traversal depth for a single path.
+    # If True, use real data from CSVs; otherwise, generate synthetic data.
+    SIMULATION_USE_REAL_DATA = True
+    # Directory containing the real graph data CSV files.
+    SIMULATION_REAL_DATA_DIR = "data/real_graph_data"
+    SIMULATION_REAL_SUBGRAPH_SIZE = 200  # Max number of nodes to sample for the real data subgraph.
+    SIMULATION_ENABLE_VERIFIER = True  # If True, enables the LLM-based path evaluator.
+    SIMULATION_ENABLE_VISUALIZER = False  # If True, generates visualizations of simulation results.
+    SIMULATION_VERBOSE_LOGGING = True  # If True, prints detailed step-by-step simulation logs.
+    # Minimum outgoing degree for starting nodes (Tier 2 fallback).
+    SIMULATION_MIN_STARTING_DEGREE = 2
+    # Max node types LLM can recommend for starting nodes.
+    SIMULATION_MAX_RECOMMENDED_NODE_TYPES = 3
+
+    # ============================================
+    # DATA CONFIGURATION
+    # ============================================
+    # Special-case mapping for edge data files that do not follow the standard naming convention.
+    # Used for connectivity enhancement in RealDataSource.
+    EDGE_FILENAME_MAPPING_SPECIAL_CASES = {
+        "transfer": "AccountTransferAccount.csv",
+        "own_person": "PersonOwnAccount.csv",
+        "own_company": "CompanyOwnAccount.csv",
+        "signin": "MediumSignInAccount.csv",
+    }
+
+    # ============================================
+    # CACHE CONFIGURATION
+    # Mathematical model alignment: see the mathematical-modeling notes
+    # (数学建模.md), Section 4.6.2, for the formula derivation.
+    # ============================================
+
+    # Minimum confidence score for a Tier-1 (exact) match to be considered.
+    CACHE_MIN_CONFIDENCE_THRESHOLD = 2.0
+
+    # Multiplier for Tier-2 (similarity) confidence threshold.
+    # Formula: tier2_threshold = TIER1_THRESHOLD * TIER2_GAMMA (where γ > 1)
+    # Higher values require higher confidence for Tier-2 matching.
+    CACHE_TIER2_GAMMA = 1.2
+
+    # Kappa (κ): Base threshold parameter.
+    # Formula: δ_sim(v) = 1 - κ / (σ_logic(v) · (1 + β · log(η(v))))
+    #
+    # CRITICAL: Counter-intuitive behavior!
+    # - Higher κ → LOWER threshold → MORE permissive matching (easier to match)
+    # - Lower κ → HIGHER threshold → MORE strict matching (harder to match)
+    #
+    # This is because δ = 1 - κ/(...):
+    #   κ↑ → κ/(...)↑ → 1 - (large)↓ → threshold decreases
+    #
+    # The mathematical model (数学建模.md, lines 983-985) uses κ=0.01, which produces
+    # very HIGH thresholds (~0.99), requiring near-perfect similarity.
+    #
+    # For early-stage exploration with suboptimal embeddings, use HIGHER κ values:
+    #   κ=0.25: threshold ~0.78-0.89 for typical SKUs (original problematic value)
+    #   κ=0.30: threshold ~0.73-0.86 for typical SKUs (more permissive)
+    #   κ=0.40: threshold ~0.64-0.82 for typical SKUs (very permissive)
+    #
+    # Current setting balances exploration and safety for similarity ~0.83
+    CACHE_SIMILARITY_KAPPA = 0.30
+
+    # Beta (β): Frequency sensitivity parameter.
+    # Controls how much a SKU's confidence score (η) affects its similarity threshold.
+    # Higher beta → high-confidence (frequent) SKUs require stricter matching
+    #   (threshold closer to 1).
+    # Lower beta → reduces the difference between high-frequency and low-frequency
+    #   SKU thresholds.
+    # Interpretation: β adjusts the "popularity" (frequency) sensitivity.
+    # Recommended range: 0.05-0.2 (see 数学建模.md, lines 959 and 983-985)
+    # Using β=0.05 for gentler frequency-based threshold adjustment.
+    CACHE_SIMILARITY_BETA = 0.05
+
+    # Fingerprint for the current graph schema. Changing this will invalidate all existing SKUs.
+    CACHE_SCHEMA_FINGERPRINT = "schema_v1"
+
+    # ============================================
+    # SIGNATURE CONFIGURATION
+    # ============================================
+    # Signature abstraction level, used as a MATCHING STRATEGY at runtime.
+    # SKUs are always stored in their canonical, most detailed (Level 2) format.
+    #   0 = Abstract (out/in/both only)
+    #   1 = Edge-aware (out('friend'))
+    #   2 = Full path (including filters like has())
+    SIGNATURE_LEVEL = 2
+
+    # Optional: Whitelist of edge labels to track (None = track all).
+    # Only applicable if SIGNATURE_LEVEL >= 1.
+    SIGNATURE_EDGE_WHITELIST = None
+
+    # ============================================
+    # CYCLE DETECTION & PENALTY CONFIGURATION
+    # ============================================
+    # CYCLE_PENALTY modes: "NONE" (no validation), "PUNISH" (penalize but continue),
+    # "STOP" (terminate path)
+    CYCLE_PENALTY: Literal["NONE", "PUNISH", "STOP"] = "STOP"
+    # NOTE(review): the three values below are consumed outside this module and
+    # their exact semantics are not documented here - confirm before tuning.
+    CYCLE_DETECTION_THRESHOLD = 0.7
+    MIN_EXECUTION_CONFIDENCE = 0.1
+    POSTCHECK_MIN_EVIDENCE = 3
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Get configuration value by key.
+
+        Args:
+            key: Attribute name of the setting, or one of the legacy aliases
+                listed in ``_KEY_ALIASES``.
+            default: Value returned when no attribute with that name exists.
+
+        Returns:
+            The attribute value, or ``default`` if the attribute is missing. An
+            attribute that exists with value ``None`` is returned as ``None``.
+        """
+        # Resolve legacy aliases to the canonical attribute name, then prefer
+        # direct attribute access to avoid duplicated defaults at call sites.
+        canonical_key = self._KEY_ALIASES.get(key, key)
+        return getattr(self, canonical_key, default)
+
+    def get_int(self, key: str, default: int = 0) -> int:
+        """Get integer configuration value.
+
+        Args:
+            key: Setting name, resolved as in ``get``.
+            default: Value used when the setting does not exist.
+
+        Returns:
+            The setting converted with ``int()``.
+
+        Raises:
+            TypeError: If the value cannot be passed to ``int()`` (e.g. ``None``).
+            ValueError: If the value is a string that is not an integer literal.
+        """
+        return int(self.get(key, default))
+
+    def get_float(self, key: str, default: float = 0.0) -> float:
+        """Get float configuration value.
+
+        Args:
+            key: Setting name, resolved as in ``get``.
+            default: Value used when the setting does not exist.
+
+        Returns:
+            The setting converted with ``float()``.
+
+        Raises:
+            TypeError: If the value cannot be passed to ``float()`` (e.g. ``None``).
+            ValueError: If the value is a string that is not a number.
+        """
+        return float(self.get(key, default))
+
+    def get_bool(self, key: str, default: bool = False) -> bool:
+        """Get boolean configuration value.
+
+        String values are interpreted by content rather than by emptiness, so
+        text such as ``"false"``, ``"0"`` or ``"off"`` is False instead of being
+        truthy merely because the string is non-empty.
+
+        Args:
+            key: Setting name, resolved as in ``get``.
+            default: Value used when the setting does not exist.
+
+        Returns:
+            For strings, False if the stripped, lower-cased text is in
+            ``_FALSE_STRINGS`` and True otherwise; for any other type, the
+            ordinary truth value of the setting.
+        """
+        value = self.get(key, default)
+        if isinstance(value, str):
+            return value.strip().lower() not in self._FALSE_STRINGS
+        return bool(value)
+
+    def get_str(self, key: str, default: str = "") -> str:
+        """Get string configuration value.
+
+        Args:
+            key: Setting name, resolved as in ``get``.
+            default: Value used when the setting does not exist.
+
+        Returns:
+            The setting converted with ``str()``; note that a setting whose
+            value is ``None`` therefore yields the text ``"None"``.
+        """
+        return str(self.get(key, default))
+
+    def get_embedding_config(self) -> dict[str, str]:
+        """Get embedding service configuration.
+
+        Returns:
+            A dict with the keys ``endpoint``, ``api_key`` and ``model``.
+        """
+        return {
+            "endpoint": self.EMBEDDING_ENDPOINT,
+            "api_key": self.EMBEDDING_APIKEY,
+            "model": self.EMBEDDING_MODEL,
+        }
+
+    def get_llm_config(self) -> dict[str, str]:
+        """Get LLM service configuration.
+
+        Returns:
+            A dict with the keys ``endpoint``, ``api_key`` and ``model``.
+        """
+        return {
+            "endpoint": self.LLM_ENDPOINT,
+            "api_key": self.LLM_APIKEY,
+            "model": self.LLM_MODEL,
+        }
+
+    def get_simulation_config(self) -> dict[str, Any]:
+        """Get simulation configuration.
+
+        Returns:
+            A dict of the core simulation settings. SIMULATION_VERBOSE_LOGGING,
+            SIMULATION_MIN_STARTING_DEGREE and
+            SIMULATION_MAX_RECOMMENDED_NODE_TYPES are not part of this dict and
+            have to be read through ``get`` or as attributes.
+        """
+        return {
+            "graph_size": self.SIMULATION_GRAPH_SIZE,
+            "num_epochs": self.SIMULATION_NUM_EPOCHS,
+            "max_depth": self.SIMULATION_MAX_DEPTH,
+            "use_real_data": self.SIMULATION_USE_REAL_DATA,
+            "real_data_dir": self.SIMULATION_REAL_DATA_DIR,
+            "real_subgraph_size": self.SIMULATION_REAL_SUBGRAPH_SIZE,
+            "enable_verifier": self.SIMULATION_ENABLE_VERIFIER,
+            "enable_visualizer": self.SIMULATION_ENABLE_VISUALIZER,
+        }
+
+    def get_cache_config(self) -> dict[str, Any]:
+        """Get cache configuration.
+
+        Returns:
+            A dict with the keys ``min_confidence_threshold``, ``tier2_gamma``,
+            ``similarity_kappa``, ``similarity_beta`` and ``schema_fingerprint``.
+        """
+        return {
+            "min_confidence_threshold": self.CACHE_MIN_CONFIDENCE_THRESHOLD,
+            "tier2_gamma": self.CACHE_TIER2_GAMMA,
+            "similarity_kappa": self.CACHE_SIMILARITY_KAPPA,
+            "similarity_beta": self.CACHE_SIMILARITY_BETA,
+            "schema_fingerprint": self.CACHE_SCHEMA_FINGERPRINT,
+        }