Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
536ea57
feat: add CASTS for LLM-Graph based reasoning
Appointat Dec 29, 2025
40cc3ba
feat: enhance simulation evaluation with metadata and improve configu…
Appointat Dec 29, 2025
9d4ef40
feat: enhance LLM Oracle and Simulation Engine with Debug Logging and…
Appointat Dec 30, 2025
2a685f0
feat(reasoning): implement canonical storage with abstract matching f…
Appointat Jan 4, 2026
ac6b49e
chore: update type hints to use List and improve code formatting acro…
Appointat Jan 4, 2026
b62e524
feat: enhance LLM Oracle with starting node type recommendations
Appointat Jan 7, 2026
9b2f976
feat: implement simplePath() cycle prevention with LLM-driven path qu…
Appointat Jan 9, 2026
a48cd40
feat(metrics): add rollback_steps method to MetricsCollector
Appointat Jan 19, 2026
e1aaf2f
Merge branch 'apache:master' into master
Appointat Feb 2, 2026
ef4510d
refactor: refactor metrics handling and evaluation logic in CASTS sim…
Appointat Feb 2, 2026
2472786
refactor: refactor code structure for improved readability and mainta…
Appointat Feb 3, 2026
e534be4
refactor: move CASTS into geaflow-ai operator
Appointat Feb 4, 2026
569f319
reafactor: refactor type hints across multiple modules to use built-i…
Appointat Feb 4, 2026
53d4457
refactor: update type hints for GremlinState and PathEvaluator for im…
Appointat Feb 4, 2026
f800300
refactor: update imports to use StrategyCache from strategy_cache module
Appointat Feb 4, 2026
e9c94d1
refactor: update module documentation to improve clarity and consistency
Appointat Feb 4, 2026
ac210cc
Merge branch 'apache:master' into master
Appointat Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions geaflow-ai/src/operator/casts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Byte-compiled / optimized files
__pycache__/
*.py[cod]

# Environment variables
.env

# Virtual environment
.venv/
uv.lock

# Logs
/logs/

# IDE / OS specific
.vscode/
.DS_Store

# Data files
data/real_graph_data/
casts_traversal_path_req_*.png
*.md
Empty file.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All files in the project must start with an Apache License header, even if they are blank.

Empty file.
210 changes: 210 additions & 0 deletions geaflow-ai/src/operator/casts/casts/core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""Configuration management for CASTS system.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better not to mix with Java code; place CASTS in a dedicated folder, such as geaflow-ai/casts, and why does casts have two nested layers with the same name?


Provides a clean abstraction over configuration sources (environment variables,
config files, etc.) to eliminate hard-coded values.
"""

import os
from typing import Any, Literal

from dotenv import load_dotenv

from casts.core.interfaces import Configuration

# Load environment variables from a .env file at import time.
# This call has to stay above the DefaultConfiguration class definition:
# the class body reads os.environ while it is being evaluated, so anything
# loaded later would not be picked up by those class attributes.
load_dotenv()


class DefaultConfiguration(Configuration):
    """Centralized configuration for CASTS, exposed as class attributes.

    Service settings (endpoints, API keys, model names) are read from
    environment variables while the class body is evaluated; the module calls
    ``load_dotenv()`` beforehand so a ``.env`` file can supply them. Each
    lookup carries a fallback, so a missing variable never raises here.

    NOTE(review): the API-key fallback is a non-empty placeholder string, so
    an unset key cannot be detected by checking for emptiness.
    """

    # ============================================
    # EMBEDDING SERVICE CONFIGURATION
    # ============================================
    EMBEDDING_ENDPOINT = os.getenv("EMBEDDING_ENDPOINT", "")
    EMBEDDING_APIKEY = os.getenv("EMBEDDING_APIKEY", "YOUR_EMBEDDING_API_KEY_HERE")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's recommended to remove default values and provide explicit error messages when required values are empty.

# Fall back to a known embedding model so call sites need no default of their own.
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-v3")

# ============================================
# LLM SERVICE CONFIGURATION
# ============================================
# Each value comes from the environment variable of the same name; the second
# argument is the fallback used when that variable is unset.
LLM_ENDPOINT = os.getenv("LLM_ENDPOINT", "")
LLM_APIKEY = os.getenv("LLM_APIKEY", "YOUR_LLM_API_KEY_HERE")
LLM_MODEL = os.getenv("LLM_MODEL", "")

# ============================================
# SIMULATION CONFIGURATION
# ============================================
# For synthetic data: the number of nodes in the generated graph.
SIMULATION_GRAPH_SIZE = 40
# Number of simulation epochs to run.
SIMULATION_NUM_EPOCHS = 5
# Max traversal depth for a single path.
SIMULATION_MAX_DEPTH = 5
# If True, use real data from CSVs; otherwise, generate synthetic data.
SIMULATION_USE_REAL_DATA = True
# Directory containing the real graph data CSV files.
SIMULATION_REAL_DATA_DIR = "data/real_graph_data"
# Max number of nodes to sample for the real data subgraph.
SIMULATION_REAL_SUBGRAPH_SIZE = 200
# If True, enables the LLM-based path evaluator.
SIMULATION_ENABLE_VERIFIER = True
# If True, generates visualizations of simulation results.
SIMULATION_ENABLE_VISUALIZER = False
# If True, prints detailed step-by-step simulation logs.
SIMULATION_VERBOSE_LOGGING = True
# Minimum outgoing degree for starting nodes (Tier 2 fallback).
SIMULATION_MIN_STARTING_DEGREE = 2
# Max node types the LLM can recommend for starting nodes.
SIMULATION_MAX_RECOMMENDED_NODE_TYPES = 3

# ============================================
# DATA CONFIGURATION
# ============================================
# Overrides for edge data files whose CSV name does not follow the standard
# naming convention. Used for connectivity enhancement in RealDataSource.
# NOTE(review): the keys look like edge-type identifiers and the values like
# file names inside the real-data directory - confirm against RealDataSource.
EDGE_FILENAME_MAPPING_SPECIAL_CASES = {
    "transfer": "AccountTransferAccount.csv",
    "own_person": "PersonOwnAccount.csv",
    "own_company": "CompanyOwnAccount.csv",
    "signin": "MediumSignInAccount.csv",
}

# ============================================
# CACHE CONFIGURATION
# Mathematical model alignment: see `数学建模.md` ("Mathematical Modeling"),
# Section 4.6.2, for the formula derivation.
# NOTE(review): confirm that document is actually tracked in the repository
# and reachable from this package.
# ============================================

# Minimum confidence score for a Tier-1 (exact) match to be considered.
CACHE_MIN_CONFIDENCE_THRESHOLD = 2.0

# Multiplier for the Tier-2 (similarity) confidence threshold.
# Formula: tier2_threshold = TIER1_THRESHOLD * TIER2_GAMMA (where γ > 1)
# NOTE(review): TIER1_THRESHOLD presumably refers to
# CACHE_MIN_CONFIDENCE_THRESHOLD above - confirm at the use site.
# Higher values require higher confidence for Tier-2 matching.
CACHE_TIER2_GAMMA = 1.2

# Kappa (κ): base threshold parameter.
# Formula: δ_sim(v) = 1 - κ / (σ_logic(v) · (1 + β · log(η(v))))
#
# CRITICAL: counter-intuitive behavior!
#   - Higher κ → LOWER threshold → MORE permissive matching (easier to match)
#   - Lower κ  → HIGHER threshold → MORE strict matching (harder to match)
#
# This follows from δ = 1 - κ/(...):
#   κ↑ → κ/(...)↑ → 1 - (larger term)↓ → threshold decreases
#
# The mathematical model (`数学建模.md`, lines 983-985) uses κ=0.01, which
# produces very HIGH thresholds (~0.99), requiring near-perfect similarity.
#
# For early-stage exploration with suboptimal embeddings, use HIGHER κ values:
#   κ=0.25: threshold ~0.78-0.89 for typical SKUs (original problematic value)
#   κ=0.30: threshold ~0.73-0.86 for typical SKUs (more permissive)
#   κ=0.40: threshold ~0.64-0.82 for typical SKUs (very permissive)
#
# The current setting balances exploration and safety for similarity ~0.83.
CACHE_SIMILARITY_KAPPA = 0.30

# Beta (β): frequency sensitivity parameter.
# Controls how much a SKU's confidence score (η) affects its similarity threshold.
# Higher beta → high-confidence (frequent) SKUs require stricter matching
# (threshold closer to 1).
# Lower beta → reduces the difference between high-frequency and low-frequency
# SKU thresholds.
# Interpretation: β adjusts the "popularity sensitivity" (frequency sensitivity).
# Recommended range: 0.05-0.2 (see `数学建模.md`, lines 959 and 983-985).
# Using β=0.05 for a gentler frequency-based threshold adjustment.
CACHE_SIMILARITY_BETA = 0.05

# Fingerprint for the current graph schema. Changing this will invalidate all existing SKUs.
CACHE_SCHEMA_FINGERPRINT = "schema_v1"

# ============================================
# SIGNATURE CONFIGURATION
# ============================================
# Signature abstraction level, used as a MATCHING STRATEGY at runtime.
# SKUs are always stored in their canonical, most detailed (Level 2) format;
# this setting only selects how much of that detail is compared.
#   0 = Abstract (out/in/both only)
#   1 = Edge-aware (e.g. out('friend'))
#   2 = Full path (including filters like has())
SIGNATURE_LEVEL = 2

# Optional whitelist of edge labels to track (None = track all).
# Only applicable if SIGNATURE_LEVEL >= 1.
SIGNATURE_EDGE_WHITELIST = None

# ============================================
# CYCLE DETECTION & PENALTY CONFIGURATION
# ============================================
# CYCLE_PENALTY selects what happens when cycle validation applies:
#   "NONE"   - no validation
#   "PUNISH" - penalize but continue
#   "STOP"   - terminate the path
CYCLE_PENALTY: Literal["NONE", "PUNISH", "STOP"] = "STOP"
# NOTE(review): the three values below are not documented in this file; their
# exact meaning and scale should be confirmed at the call sites.
CYCLE_DETECTION_THRESHOLD = 0.7
MIN_EXECUTION_CONFIDENCE = 0.1
POSTCHECK_MIN_EVIDENCE = 3

def get(self, key: str, default: Any = None) -> Any:
    """Look up a configuration value by name.

    Args:
        key: Attribute name of the setting, or one of the legacy aliases
            ``EMBEDDING_MODEL_NAME`` / ``LLM_MODEL_NAME``.
        default: Value returned when ``key`` is neither an alias nor an
            attribute of this object.

    Returns:
        The aliased value, the attribute value, or ``default``.
    """
    # Legacy/alias key names still used in the codebase, resolved up front.
    legacy_aliases = {
        "EMBEDDING_MODEL_NAME": self.EMBEDDING_MODEL,
        "LLM_MODEL_NAME": self.LLM_MODEL,
    }
    if key not in legacy_aliases:
        # Direct attribute access keeps defaults out of the call sites.
        return getattr(self, key, default)
    return legacy_aliases[key]

def get_int(self, key: str, default: int = 0) -> int:
    """Return the setting ``key`` converted with ``int()``.

    ``default`` is used when the key is unknown; conversion errors from
    ``int()`` propagate to the caller.
    """
    raw_value = self.get(key, default)
    return int(raw_value)

def get_float(self, key: str, default: float = 0.0) -> float:
    """Return the setting ``key`` converted with ``float()``.

    ``default`` is used when the key is unknown; conversion errors from
    ``float()`` propagate to the caller.
    """
    raw_value = self.get(key, default)
    return float(raw_value)

def get_bool(self, key: str, default: bool = False) -> bool:
    """Return the setting ``key`` interpreted as a boolean.

    Non-string values use ordinary truthiness. String values are parsed,
    because ``bool("false")`` and ``bool("0")`` are both ``True`` and would
    silently enable a flag that was meant to be off (for example when the
    value originates from an environment variable).

    Args:
        key: Name of the setting to read.
        default: Value used when the key is unknown.

    Returns:
        ``False`` for falsy values and for the strings ``""``, ``"0"``,
        ``"false"``, ``"no"`` and ``"off"`` (case-insensitive, surrounding
        whitespace ignored); ``True`` otherwise.
    """
    value = self.get(key, default)
    if isinstance(value, str):
        # Any other non-empty text keeps its previous meaning of True.
        return value.strip().lower() not in ("", "0", "false", "no", "off")
    return bool(value)

def get_str(self, key: str, default: str = "") -> str:
    """Return the setting ``key`` converted with ``str()``.

    ``default`` is used when the key is unknown.
    """
    raw_value = self.get(key, default)
    return str(raw_value)

def get_embedding_config(self) -> dict[str, str]:
    """Bundle the embedding service settings into one dictionary.

    Returns:
        A dict with the keys ``endpoint``, ``api_key`` and ``model``, taken
        from the ``EMBEDDING_*`` attributes.
    """
    endpoint = self.EMBEDDING_ENDPOINT
    api_key = self.EMBEDDING_APIKEY
    model = self.EMBEDDING_MODEL
    return {"endpoint": endpoint, "api_key": api_key, "model": model}

def get_llm_config(self) -> dict[str, str]:
    """Bundle the LLM service settings into one dictionary.

    Returns:
        A dict with the keys ``endpoint``, ``api_key`` and ``model``, taken
        from the ``LLM_*`` attributes.
    """
    endpoint = self.LLM_ENDPOINT
    api_key = self.LLM_APIKEY
    model = self.LLM_MODEL
    return {"endpoint": endpoint, "api_key": api_key, "model": model}

def get_simulation_config(self) -> dict[str, Any]:
    """Bundle a subset of the simulation settings into one dictionary.

    Returns:
        A dict keyed by short lower-case names, each mapped to the value of
        the corresponding ``SIMULATION_*`` attribute listed below.
    """
    # (result key, attribute name) pairs, in the order the keys are emitted.
    exported = (
        ("graph_size", "SIMULATION_GRAPH_SIZE"),
        ("num_epochs", "SIMULATION_NUM_EPOCHS"),
        ("max_depth", "SIMULATION_MAX_DEPTH"),
        ("use_real_data", "SIMULATION_USE_REAL_DATA"),
        ("real_data_dir", "SIMULATION_REAL_DATA_DIR"),
        ("real_subgraph_size", "SIMULATION_REAL_SUBGRAPH_SIZE"),
        ("enable_verifier", "SIMULATION_ENABLE_VERIFIER"),
        ("enable_visualizer", "SIMULATION_ENABLE_VISUALIZER"),
    )
    return {name: getattr(self, attribute) for name, attribute in exported}

def get_cache_config(self) -> dict[str, Any]:
    """Bundle the cache settings into one dictionary.

    Returns:
        A dict keyed by short lower-case names, each mapped to the value of
        the corresponding ``CACHE_*`` attribute listed below.
    """
    # (result key, attribute name) pairs, in the order the keys are emitted.
    exported = (
        ("min_confidence_threshold", "CACHE_MIN_CONFIDENCE_THRESHOLD"),
        ("tier2_gamma", "CACHE_TIER2_GAMMA"),
        ("similarity_kappa", "CACHE_SIMILARITY_KAPPA"),
        ("similarity_beta", "CACHE_SIMILARITY_BETA"),
        ("schema_fingerprint", "CACHE_SCHEMA_FINGERPRINT"),
    )
    return {name: getattr(self, attribute) for name, attribute in exported}
Loading