Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
149aab3
Initial plan
Copilot Mar 17, 2026
8c6f052
🧹 Refactor: Remove bare `except Exception:` suppressing errors silent…
bashandbone Mar 16, 2026
7af95d9
chore: remove deprecated `combined_lifespan` alias (#226)
bashandbone Mar 16, 2026
912ddb1
chore: update dependencies
bashandbone Mar 16, 2026
c6be465
Fix mteb_to_codeweaver.py: undefined names, broken init block, wrong …
Copilot Mar 16, 2026
e3aab2b
test: update CircuitBreakerState to use .variable (#228)
bashandbone Mar 16, 2026
ad05881
⚡ perf: Optimize Provider membership check using tuple (#218)
bashandbone Mar 16, 2026
1328896
⚡ perf: optimize membership check in _check_profile using set (#220)
bashandbone Mar 16, 2026
b130666
fix: namespace conflict preventing initialization when Fastembed prov…
bashandbone Mar 16, 2026
ac238fa
🧪 Add unit tests for DiscoveredFile absolute_path property (#215)
bashandbone Mar 16, 2026
e5f261e
⚡ Optimize list membership check to set for HTML tags (#216)
bashandbone Mar 16, 2026
0f6a241
fix: feature gating for duck duck go
bashandbone Mar 16, 2026
705b9a3
Fix mock_provider_lazy configuration in integration tests (#232)
bashandbone Mar 16, 2026
ff38668
perf(cli): optimize membership check in index command (#234)
bashandbone Mar 16, 2026
f2be6e9
🧪 test: add coverage for force shutdown handler (#217)
bashandbone Mar 16, 2026
5624c92
fix: integration test failures
bashandbone Mar 16, 2026
040b675
🧪 test: Add unit tests for get_version fallback mechanisms (#219)
bashandbone Mar 16, 2026
ec64851
🔒 Replace insecure pickle with JSON for node types cache (#233)
bashandbone Mar 16, 2026
e46c760
fix: uuid7 generator resolution
bashandbone Mar 16, 2026
3a92806
fix: correct uuid7 timestamp handling
bashandbone Mar 16, 2026
f45a9a2
fix: Remove "has_package() is None" checks; has_package returns a boo…
bashandbone Mar 16, 2026
e9cef69
fix: test_init tests bypassing themselves
bashandbone Mar 16, 2026
c5db9dd
Initial plan
Copilot Mar 17, 2026
bda57af
🔒 fix: security hardening for DI container eval - remove type builtin…
Copilot Mar 17, 2026
ede3ade
🔒 fix: security hardening for DI container eval - remove type builtin…
Copilot Mar 17, 2026
56dfd3e
🔒 fix: security hardening for DI container eval - remove type builtin…
Copilot Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .github/workflows/claude.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ jobs:
with:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
github_token: ${{ secrets.GITHUB_TOKEN }}
allowed_non_write_users: Copilot
allowed_bots: "github-actions[bot],copilot[bot],dependabot[bot],copilot,github-actions,gemini[bot],claude[bot]"
allowed_non_write_users: Copilot,copilot,jules[bot],jules
allowed_bots: "github-actions[bot],copilot[bot],dependabot[bot],copilot,github-actions,gemini[bot],claude[bot],jules[bot]"
trigger_phrase: "@claude"
assignee_trigger: claude[bot]
label_trigger: claude
Expand Down Expand Up @@ -105,6 +105,8 @@ jobs:
with:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
github_token: ${{ secrets.GITHUB_TOKEN }}
allowed_non_write_users: Copilot,copilot,jules[bot],jules
allowed_bots: "github-actions[bot],copilot[bot],dependabot[bot],copilot,github-actions,gemini[bot],claude[bot],jules[bot]"
trigger_phrase: "@claude"
assignee_trigger: claude
label_trigger: claude
Expand Down Expand Up @@ -140,6 +142,8 @@ jobs:
with:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
github_token: ${{ secrets.GITHUB_TOKEN }}
allowed_non_write_users: Copilot,copilot,jules[bot],jules
allowed_bots: "github-actions[bot],copilot[bot],dependabot[bot],copilot,github-actions,gemini[bot],claude[bot],jules[bot]"
trigger_phrase: "@claude"
assignee_trigger: claude
label_trigger: claude
Expand Down Expand Up @@ -177,6 +181,8 @@ jobs:
with:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
github_token: ${{ secrets.GITHUB_TOKEN }}
allowed_non_write_users: Copilot,copilot,jules[bot],jules
allowed_bots: "github-actions[bot],copilot[bot],dependabot[bot],copilot,github-actions,gemini[bot],claude[bot],jules[bot]"
trigger_phrase: "@claude"
assignee_trigger: claude
label_trigger: claude
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,5 +229,5 @@ test-results.xml
mise.local.toml
mise.local.env

.gemini/
gha-creds-*.json
.exportify/
!.exportify/config.toml
2 changes: 1 addition & 1 deletion docs/plans/2026-02-24-provider-config-consolidation.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def _get_default_reranking_settings() -> DeterminedDefaults:
Priority: Voyage (cloud, auth required) > FastEmbed > SentenceTransformers.
"""
for lib in ("voyageai", "fastembed_gpu", "fastembed", "sentence_transformers"):
if has_package(lib) is not None:
if has_package(lib):
if lib == "voyageai" and Provider.VOYAGE.has_env_auth:
return DeterminedDefaults(
provider=Provider.VOYAGE,
Expand Down
2 changes: 1 addition & 1 deletion hk.pkl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ local linters = new Mapping<String, Step> {
}
["check-added-large-files"] = new Step {
glob = "*"
check = "hk util check-added-large-files --maxkb 10000 {{ files }}"
check = "hk util check-added-large-files --maxkb 15000 {{ files }}"
}
["check-merge-conflict-markers"] = new Step {
glob = "*"
Expand Down
6 changes: 3 additions & 3 deletions mise.dev.toml
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,8 @@ uv run python scripts/build/prepare-build.py || {

# Step 2: Check for generated artifacts
echo -e "${CW_PREFIX} Step 2/7: Verifying generated artifacts..."
if [ ! -f "src/codeweaver/data/node_types_cache.pkl" ]; then
echo -e "${YELLOW}Warning: node_types_cache.pkl not found${NC}"
if [ ! -f "src/codeweaver/semantic/data/node_types_cache.json" ]; then
echo -e "${YELLOW}Warning: node_types_cache.json not found${NC}"
fi

# Step 3: Update CHANGELOG
Expand All @@ -302,7 +302,7 @@ reuse spdx --add-license-concluded --creator-person "automated[bot]" --creator-o
# Step 6: Commit if not disabled
if [ -z "${usage_no_commit:-}" ]; then
echo -e "${CW_PREFIX} Step 6/7: Committing build artifacts..."
git add src/codeweaver/data/node_types_cache.pkl 2>/dev/null || true
git add src/codeweaver/semantic/data/node_types_cache.json 2>/dev/null || true
git add CHANGELOG.md 2>/dev/null || true
git add schema/ 2>/dev/null || true

Expand Down
1 change: 1 addition & 0 deletions mise.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ ast-grep = "latest"
python = '''{{ get_env(name="MISE_PYTHON_VERSION", default="3.13") }}'''
uv = "latest"
"pipx:exportify" = "0.2.5"
hk = "1.38.0"

# Quick note for those unfamiliar with mise:
# - despite the namespace, tools prefixed with "pipx:" are installed via "uv" (I assume once upon a time they were installed with pipx before uv came along)
Expand Down
20 changes: 10 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ dependencies = [
# So we pin it to make sure we don't break on minor releases
"pydantic==2.12.5",
# for, you know, platform dirs
"platformdirs>=4.9.2",
"platformdirs>=4.9.4",
# psutil used for resource governance/limiting by engine
"psutil>=7.2.2",
"textcase>=0.4.5",
Expand All @@ -140,20 +140,20 @@ dependencies = [
# for local providers (sentence-transformers, fastembed) to detect CPU/GPU features
"py-cpuinfo>=9.0.0",
# * ================ CLI Dependencies ==================*
"cyclopts>=4.5.1",
"rich>=14.3.0",
"cyclopts>=4.10.0",
"rich>=14.3.3",
# * ================ Provider Clients ==================*
# we must pin these to specific versions to ensure compatibility with our ClientOptions subclasses
"boto3==1.42.19",
"cohere==5.20.1",
"cohere==5.20.7",
"fastembed==0.7.4; python_version < '3.14'",
"google-genai==1.56.0",
# NOTE: We're waiting on pydantic-ai to update to 1.0+ before we can upgrade too
"huggingface-hub==0.36.2",
"huggingface-hub>=1.7.1",
"mistralai==1.10.0",
"openai==2.17.0",
"qdrant-client==1.16.2",
"pydantic-ai-slim>=1.56.0",
"openai==2.28.0",
"qdrant-client==1.17.1",
"pydantic-ai-slim>=1.68.0",
"sentence-transformers==5.2.0; python_version <= '3.14'",
"voyageai==0.3.7",
# * ================ Indexing and Engine ==================*
Expand All @@ -171,12 +171,12 @@ dependencies = [
# fastmcp is the core MCP server framework
"fastmcp>=2.14.5",
# just used for types but we need them at runtime for Pydantic models
"mcp>=1.19.0",
"mcp>=1.23.3",
# Runs the core admin/management server
"uvicorn[standard]>=0.40.0",
# * ================ Configuration and Settings ==================*
# pydantic-settings with toml and yaml support for config files
"pydantic-settings[toml,yaml]>=2.12.0", # Pulls: tomli>=2.0.1, pyyaml>=6.0.1
"pydantic-settings[toml,yaml]>=2.13.1", # Pulls: tomli>=2.0.1, pyyaml>=6.0.1
# For writing toml config files
"tomli-w>=1.2.0",
# * ================ Telemetry and Observability ==================*
Expand Down
2 changes: 1 addition & 1 deletion scripts/build/generate-mcp-server-json.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
ConfigLanguage,
EnvFormat,
Provider,
ProviderCategory,
ProviderEnvVarInfo,
ProviderEnvVars,
ProviderCategory,
SemanticSearchLanguage,
)

Expand Down
11 changes: 6 additions & 5 deletions scripts/build/preprocess-node-types.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@

This script loads all tree-sitter node-types.json files, parses them into
CodeWeaver's internal Thing/Category representation, and serializes the
result to a pickle cache. This cache is loaded at runtime for fast startup.
result to a JSON cache. This cache is loaded at runtime for fast startup.
"""

from __future__ import annotations

import pickle
import sys

from pathlib import Path

from pydantic_core import to_json


def main() -> int:
"""Preprocess node types and generate cache file."""
Expand All @@ -43,13 +44,13 @@ def main() -> int:
"registration_cache": parser.registration_cache,
}

# Write cache file
# Write cache file (no indentation to keep file size under version control limits)
cache_file = repo_root / "src" / "codeweaver" / \
"semantic" / "data" / "node_types_cache.pkl"
"semantic" / "data" / "node_types_cache.json"
print(f"Writing cache to {cache_file}...")

with cache_file.open("wb") as f:
pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL)
f.write(to_json(cache_data))

cache_size = cache_file.stat().st_size
print(f"βœ“ Generated node_types cache: {cache_file}")
Expand Down
5 changes: 4 additions & 1 deletion scripts/model_data/hf-models.json
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@
]
}
},
"models": {},
"models": {
"Alibaba-NLP/gte-modernbert-base": {
"adapted_from": null,
Expand Down Expand Up @@ -3924,4 +3925,6 @@
"opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini"
]
}
}
{
"models": {}
}
Comment on lines +3928 to +3930
Empty file modified scripts/model_data/hf-models.json.license
100755 → 100644
Empty file.
45 changes: 21 additions & 24 deletions scripts/model_data/mteb_to_codeweaver.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@


# make sure codeweaver is importable
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))

from codeweaver.core import Provider
from codeweaver.providers import PartialCapabilities
from codeweaver.providers.provider import Provider

from codeweaver.providers.embedding.capabilities.types import PartialCapabilities


# TODO: Finish refactor to use these inline constants and eliminate the hf-models.json
Expand Down Expand Up @@ -99,6 +100,13 @@
Note: FastEmbed also has some aliases, but we handle those dynamically below.
"""

KNOWN_ALIASES: dict[str, dict[ModelName, ModelName]] = {"ollama": OLLAMA_ALIASES}
"""A mapping of provider names to their HF name → provider alias mappings.

Keys are provider name strings (e.g. "ollama") and values are dicts mapping HF model names
to the provider-specific alias. FastEmbed aliases are handled dynamically via get_fastembed_aliases().
"""

KNOWN_SPARSE_MODELS = {
Provider.FASTEMBED: [
"Qdrant/bm25",
Expand Down Expand Up @@ -364,13 +372,13 @@ def attempt_to_get_version(name: str) -> str | int | float | None:
"Snowflake",
]
type HFModelProviders = Literal[
Provider.FASTEMBED,
Provider.HUGGINGFACE_INFERENCE,
Provider.FIREWORKS,
Provider.GROQ,
Provider.OLLAMA,
Provider.SENTENCE_TRANSFORMERS,
Provider.TOGETHER,
Provider.FASTEMBED, # ty: ignore[invalid-type-form]
Provider.HUGGINGFACE_INFERENCE, # ty: ignore[invalid-type-form]
Provider.FIREWORKS, # ty: ignore[invalid-type-form]
Provider.GROQ, # ty: ignore[invalid-type-form]
Provider.OLLAMA, # ty: ignore[invalid-type-form]
Provider.SENTENCE_TRANSFORMERS, # ty: ignore[invalid-type-form]
Provider.TOGETHER, # ty: ignore[invalid-type-form]
]


Expand All @@ -382,13 +390,7 @@ def attempt_to_get_version(name: str) -> str | int | float | None:

type DataMap = dict[ModelName, SimplifiedModelMeta]

type ModelMap = dict[
ModelMaker,
dict[
ModelName,
tuple[Annotated[HFModelProviders, BeforeValidator(lambda v: Provider.from_string(v))], ...],
],
]
type ModelMap = dict[ModelMaker, dict[ModelName, tuple[HFModelProviders, ...]]]
"""A mapping of model makers to their models and the providers that support each model."""


Expand Down Expand Up @@ -520,29 +522,24 @@ def load(cls) -> RootJson:
return cls.model_validate_json(cls._json_path.read_text())


"""
if JSON_CACHE.exists():
_ROOT = RootJson.load()
DATA = _ROOT.models
MODEL_MAP_DATA = _ROOT.model_map
ALIAS_MAP_DATA = _ROOT.aliases
SPARSE_MODELS = _ROOT.sparse_models

FLATTENED_ALIASES = _ROOT.flattened_aliases
else:
_ROOT = RootJson(models={})
DATA = {}
MODEL_MAP_DATA = {}
ALIAS_MAP_DATA = {}
SPARSE_MODELS = {}
FLATTENED_ALIASES = {}
"""


def mteb_to_capabilities(model: SimplifiedModelMeta) -> PartialCapabilities:
"""
Convert an MTEB model metadata dictionary to a PartialCapabilities object.
"""
loader = getattr(model, "loader", {})
loader = loader if isinstance(loader, dict) else {}
caps = {
"name": model["name"],
"default_dimension": model.get("embed_dim"),
Expand Down
Empty file modified scripts/model_data/secondary_providers.json
100755 → 100644
Empty file.
Empty file modified scripts/model_data/secondary_providers.json.license
100755 → 100644
Empty file.
2 changes: 1 addition & 1 deletion src/codeweaver/cli/commands/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ async def _perform_clear_operation(
response = display.console.input(
"[yellow]Are you sure you want to continue? (yes/no):[/yellow] "
)
if response.lower() not in ["yes", "y"]:
if response.lower() not in {"yes", "y"}:
display.print_info("Operation cancelled")
sys.exit(0)

Expand Down
30 changes: 19 additions & 11 deletions src/codeweaver/core/di/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,18 +84,20 @@ def __init__(self) -> None:
self._request_cache: dict[Any, Any] = {} # Keys can be types or callables
self._providers_loaded: bool = False # Track if auto-discovery has run

def _safe_eval_type(self, type_str: str, globalns: dict[str, Any]) -> Any:
def _safe_eval_type(self, type_str: str, globalns: dict[str, Any]) -> Any | None:
"""Safely evaluate a type string using AST validation.

Parses the type string into an AST, validates that it contains only safe
Comment on lines +87 to +90
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. This PR conflates multiple concerns which makes review difficult and rollback risky. The security fix itself is solid, but it should be separated from:

  1. Major dependency bumps (huggingface-hub 0.36→1.7, openai 2.17→2.28, etc.)
  2. Pickle→JSON cache migration
  3. Pre-existing lint fixes

Each deserves independent review and testing. If a dependency update causes issues, rolling back would also revert the security fix.

constructs (names, attributes, subscripts, unions, calls), and evaluates
it in a restricted environment with a minimal set of builtins.

Args:
type_str: The string representation of a type.
globalns: The global namespace for evaluation.

Returns:
The evaluated type object.

Raises:
ValueError: If the type string contains forbidden constructs.
The evaluated type object, or None if the type string is invalid or
contains forbidden constructs.
"""
try:
tree = ast.parse(type_str, mode="eval")
Expand Down Expand Up @@ -126,20 +128,25 @@ def generic_visit(self, node: ast.AST) -> None:
ast.keyword,
),
):
raise ValueError(f"Forbidden AST node in type string: {type(node).__name__}")
raise TypeError(f"Forbidden AST node in type string: {type(node).__name__}")

# Block dunder access to prevent escaping the restricted environment
if isinstance(node, ast.Name) and node.id.startswith("__"):
raise ValueError(f"Forbidden dunder name: {node.id}")
raise TypeError(f"Forbidden dunder name: {node.id}")
if isinstance(node, ast.Attribute) and node.attr.startswith("__"):
raise ValueError(f"Forbidden dunder attribute: {node.attr}")
raise TypeError(f"Forbidden dunder attribute: {node.attr}")

super().generic_visit(node)

TypeValidator().visit(tree)
try:
TypeValidator().visit(tree)
except TypeError:
return None

# Restricted eval: only allow basic builtin types to be resolved
# even if they are not in the module's globals.
# Note: `type` is intentionally excluded — it is not needed for type annotation
# resolution, and allowing it as a callable would increase the attack surface.
safe_builtins = {
"int": int,
"float": float,
Expand All @@ -150,13 +157,12 @@ def generic_visit(self, node: ast.AST) -> None:
"dict": dict,
"set": set,
"frozenset": frozenset,
"type": type,
"object": object,
"bytes": bytes,
}

code = compile(tree, "<string>", "eval")
return eval(code, {"__builtins__": safe_builtins}, globalns)
return eval(code, {"__builtins__": safe_builtins}, globalns) # noqa: S307

@staticmethod
def _unwrap_annotated(annotation: Any) -> Any:
Expand Down Expand Up @@ -636,6 +642,8 @@ async def resolve(
if self._is_union_type(interface):
instance = await self._resolve_union_interface(interface, cache_key, _resolution_stack)
return cast(T, instance)
if interface is type(None):
return cast(T, None)

# 1. Check overrides first
# We check overrides before tags and singletons because overrides
Expand Down
Loading
Loading