Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion ex/lib/cantrip/circle.ex
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,20 @@ defmodule Cantrip.Circle do
}
def execute_gate(circle, gate_name, args) do
  # Normalize the name once so both telemetry events and the dispatch
  # below report/use the same gate name.
  canonical = canonical_gate_name(gate_name)
  call_metadata = %{gate_name: canonical, args: args}

  # Announce the call before starting the clock, so the measured duration
  # covers only the gate execution itself.
  :telemetry.execute([:cantrip, :gate, :call], %{}, call_metadata)

  t0 = System.monotonic_time(:millisecond)
  outcome = do_execute(circle, canonical, args)
  # Clamp at zero so the reported duration is never negative.
  elapsed_ms = max(System.monotonic_time(:millisecond) - t0, 0)

  # The duration is published both as a measurement and in the metadata.
  result_metadata = Map.merge(call_metadata, %{result: outcome, duration: elapsed_ms})
  :telemetry.execute([:cantrip, :gate, :result], %{duration: elapsed_ms}, result_metadata)

  outcome
end

@spec gate_names(t()) :: [String.t()]
Expand Down
26 changes: 26 additions & 0 deletions ex/lib/cantrip/conformance/expect.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
defmodule Cantrip.Conformance.Expect do
  @moduledoc "Expectation helpers for YAML conformance checks."

  @doc """
  Checks each recognised expectation in `expect` against `result`.

  Recognised keys (as strings or atoms) are `result`, `truncated` and `turns`;
  they are compared with `result[:result]`, `result[:meta][:truncated]` and
  `result[:meta][:turns]` respectively. Any other key is ignored.

  Returns `:ok`, or raises a `RuntimeError` on the first mismatch.
  """
  @spec check!(map(), map()) :: :ok
  def check!(expect, result) do
    Enum.each(expect, fn {key, expected} -> check_entry!(key, expected, result) end)
    :ok
  end

  # One clause per recognised expectation; string and atom spellings share a clause.
  defp check_entry!(key, expected, result) when key in ["result", :result],
    do: assert_equal!(expected, result[:result], "result")

  defp check_entry!(key, expected, result) when key in ["truncated", :truncated],
    do: assert_equal!(expected, result[:meta][:truncated], "truncated")

  defp check_entry!(key, expected, result) when key in ["turns", :turns],
    do: assert_equal!(expected, result[:meta][:turns], "turn_count")

  # Unrecognised expectation keys are skipped rather than rejected.
  defp check_entry!(_key, _expected, _result), do: :ok

  # Values are compared with `==`, so e.g. 1 and 1.0 count as equal.
  defp assert_equal!(expected, actual, _label) when expected == actual, do: nil

  defp assert_equal!(expected, actual, label) do
    raise "expectation failed for #{label}: expected #{inspect(expected)} got #{inspect(actual)}"
  end
end
23 changes: 23 additions & 0 deletions ex/lib/cantrip/conformance/fake_llm.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
defmodule Cantrip.Conformance.FakeLLM do
  @moduledoc "Deterministic scripted LLM for conformance tests."

  @behaviour Cantrip.LLM

  defstruct responses: [], index: 0

  @type t :: %__MODULE__{responses: list(map()), index: non_neg_integer()}

  @doc """
  Builds a fake LLM that will replay `responses` in order, starting at the first.
  """
  @spec new(list(map())) :: t()
  def new(responses) when is_list(responses) do
    %__MODULE__{responses: responses, index: 0}
  end

  # Returns the response at the current index together with a state whose
  # index has advanced by one. Raises once the script has been used up; the
  # request itself is ignored.
  @impl true
  def query(%__MODULE__{responses: scripted, index: cursor} = state, _request) do
    with {:ok, response} <- Enum.fetch(scripted, cursor) do
      {:ok, response, %{state | index: cursor + 1}}
    else
      :error ->
        raise RuntimeError, "conformance fake llm responses exhausted at index #{cursor}"
    end
  end
end
56 changes: 56 additions & 0 deletions ex/lib/cantrip/conformance/loader.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
defmodule Cantrip.Conformance.Loader do
  @moduledoc "Load and validate conformance tests from tests.yaml."

  alias Cantrip.Conformance.TestCase

  @required_fields ~w(rule name setup action expect)a

  @doc """
  Loads the conformance cases stored in the YAML file at `path`.

  The YAML is converted to JSON by shelling out to `ruby`, decoded, and each
  entry is validated and turned into a `TestCase` struct.

  Raises `RuntimeError` when `ruby` is unavailable or the file cannot be
  parsed, and `ArgumentError` when an entry does not match the expected schema.
  """
  @spec load(Path.t()) :: [TestCase.t()]
  def load(path \\ default_path()) do
    path
    |> read_yaml!()
    |> Enum.map(&to_test_case!/1)
  end

  # Parses the YAML file by delegating to Ruby's YAML/JSON standard libraries.
  defp read_yaml!(path) do
    script = "require 'yaml'; require 'json'; puts JSON.generate(YAML.load_file(ARGV[0]))"

    # Fail with a readable message instead of an opaque :enoent ErlangError.
    ruby =
      System.find_executable("ruby") ||
        raise "failed to parse yaml: ruby executable not found on PATH"

    # stderr is merged into the captured output so a failure message carries
    # Ruby's diagnostic; -W0 silences warnings so they cannot leak into the
    # JSON emitted on the success path.
    case System.cmd(ruby, ["-W0", "-e", script, path], stderr_to_stdout: true) do
      {json, 0} -> Jason.decode!(json)
      {output, status} -> raise "failed to parse yaml (status #{status}): #{output}"
    end
  end

  defp default_path do
    Path.expand("../../../../tests.yaml", __DIR__)
  end

  defp to_test_case!(raw) when is_map(raw) do
    ensure_required_fields!(raw)

    %TestCase{
      id: get_required!(raw, :rule),
      description: get_required!(raw, :name),
      setup: as_map!(get_required!(raw, :setup), :setup),
      action: get_required!(raw, :action),
      expect: as_map!(get_required!(raw, :expect), :expect)
    }
  end

  defp to_test_case!(_), do: raise(ArgumentError, "invalid testcase: expected map")

  # A field counts as present when it is non-nil under either its atom or
  # its string key.
  defp ensure_required_fields!(raw) do
    Enum.each(@required_fields, fn field ->
      if Map.get(raw, field) == nil and Map.get(raw, Atom.to_string(field)) == nil do
        raise ArgumentError, "invalid testcase schema: missing #{field}"
      end
    end)
  end

  # Prefers the atom key and falls back to the string key only when the atom
  # key is nil, mirroring the nil check in ensure_required_fields!/1 (so a
  # literal `false` is treated as a present value).
  defp get_required!(map, key) do
    case Map.get(map, key) do
      nil -> Map.get(map, Atom.to_string(key))
      value -> value
    end
  end

  defp as_map!(value, _field) when is_map(value), do: value

  defp as_map!(value, field) do
    raise ArgumentError, "invalid testcase schema: #{field} must be map, got #{inspect(value)}"
  end
end
50 changes: 50 additions & 0 deletions ex/lib/cantrip/conformance/runner.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
defmodule Cantrip.Conformance.Runner do
  @moduledoc "Executes tests.yaml conformance cases against Elixir Cantrip runtime."

  alias Cantrip.Conformance.{Expect, FakeLLM, Loader, TestCase}

  # Circle used when a case's setup does not specify one.
  @default_circle %{"gates" => ["done"], "wards" => [%{"max_turns" => 10}]}

  @doc """
  Loads every conformance case from the default location and runs each one.

  Returns `:ok`; raises on the first failing case.
  """
  @spec run_all!() :: :ok
  def run_all! do
    Enum.each(Loader.load(), &run_case!/1)
    :ok
  end

  @doc """
  Executes a single case and checks its expectations.

  Any exception raised while executing or checking is re-raised as a
  `RuntimeError` whose message names the case id and description, keeping the
  original stacktrace.
  """
  @spec run_case!(TestCase.t()) :: :ok
  def run_case!(%TestCase{} = test_case) do
    outcome = execute_case(test_case)
    Expect.check!(test_case.expect, outcome)
  rescue
    error ->
      reraise RuntimeError,
              [
                message:
                  "conformance case #{test_case.id} (#{test_case.description}) failed: #{Exception.message(error)}"
              ],
              __STACKTRACE__
  end

  # Builds a cantrip backed by the scripted fake LLM and runs the case's action.
  defp execute_case(%TestCase{setup: setup, action: action}) do
    scripted = get_in(setup, ["llm", "responses"]) || []
    circle = setup["circle"] || @default_circle
    identity = setup["identity"] || %{}

    {:ok, cantrip} =
      Cantrip.new(llm: {FakeLLM, FakeLLM.new(scripted)}, circle: circle, identity: identity)

    execute_action(cantrip, action)
  end

  # A single cast: only the result and meta are kept for expectation checks.
  defp execute_action(cantrip, %{"cast" => %{"intent" => intent}}) do
    {:ok, result, _next_cantrip, _loom, meta} = Cantrip.cast(cantrip, intent)
    %{result: result, meta: meta}
  end

  # A list of steps: every step runs against the same cantrip and the outcome
  # of the last step wins (an empty list yields the blank outcome).
  defp execute_action(cantrip, steps) when is_list(steps) do
    for step <- steps, reduce: %{result: nil, meta: %{}} do
      _previous -> execute_action(cantrip, step)
    end
  end

  defp execute_action(_cantrip, _action) do
    raise "unsupported action"
  end
end
12 changes: 12 additions & 0 deletions ex/lib/cantrip/conformance/test_case.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Cantrip.Conformance.TestCase do
  @moduledoc """
  Struct describing one conformance test case.

  All five fields are required when building the struct.
  """

  # Single source of truth: every struct field is also an enforced key.
  @fields [:id, :description, :setup, :action, :expect]

  @enforce_keys @fields
  defstruct @fields

  @type t :: %__MODULE__{
          id: String.t(),
          description: String.t(),
          setup: map(),
          action: map() | list(map()),
          expect: map()
        }
end
37 changes: 34 additions & 3 deletions ex/lib/cantrip/entity_server.ex
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ defmodule Cantrip.EntityServer do
stream_to = Keyword.get(opts, :stream_to)
cancel_on_parent = normalize_cancel_parents(Keyword.get(opts, :cancel_on_parent))

{:ok,
%__MODULE__{
state =
%__MODULE__{
cantrip: cantrip,
entity_id: entity_id,
messages: messages,
Expand All @@ -63,7 +63,16 @@ defmodule Cantrip.EntityServer do
code_state: code_state,
stream_to: stream_to,
cancel_on_parent: cancel_on_parent
}}
}

:telemetry.execute([:cantrip, :entity, :start], %{}, %{entity_id: state.entity_id})
{:ok, state}
end

@impl true
def terminate(_reason, state) do
  # Counterpart of the [:cantrip, :entity, :start] event emitted from init/1.
  # NOTE(review): GenServer does not invoke terminate/2 on every exit path
  # (e.g. brutal kills) — confirm consumers do not rely on always seeing :stop.
  metadata = %{entity_id: state.entity_id}
  :telemetry.execute([:cantrip, :entity, :stop], %{}, metadata)

  :ok
end

@impl true
Expand Down Expand Up @@ -109,6 +118,10 @@ defmodule Cantrip.EntityServer do
end

defp run_loop(state) do
turn_number = state.turns + 1
turn_meta = %{entity_id: state.entity_id, turn_number: turn_number, intent: nil}
:telemetry.execute([:cantrip, :turn, :start], %{}, turn_meta)

reason = truncation_reason(state)

if reason do
Expand All @@ -130,6 +143,7 @@ defmodule Cantrip.EntityServer do
cumulative_usage: state.usage
}

:telemetry.execute([:cantrip, :turn, :stop], %{duration: 0}, turn_meta)
{nil, %{state | loom: loom}, meta}
else
emit_event(state, {:step_start, %{turn: state.turns + 1, entity_id: state.entity_id}})
Expand Down Expand Up @@ -173,6 +187,12 @@ defmodule Cantrip.EntityServer do
cumulative_usage: state.usage
}

:telemetry.execute(
[:cantrip, :turn, :stop],
%{duration: max(System.monotonic_time(:millisecond) - started_at, 1)},
turn_meta
)

{message,
%{
state
Expand Down Expand Up @@ -321,6 +341,12 @@ defmodule Cantrip.EntityServer do
code_state: next_code_state
}

:telemetry.execute(
[:cantrip, :turn, :stop],
%{duration: duration_ms},
%{entity_id: state.entity_id, turn_number: next_state.turns, intent: nil}
)

emit_event(state, {:step_complete, %{turn: next_state.turns, terminated: terminated}})

if terminated do
Expand Down Expand Up @@ -380,6 +406,7 @@ defmodule Cantrip.EntityServer do
end

defp eval_code_sandboxed(code, code_state, runtime) do
:telemetry.execute([:cantrip, :code, :eval, :start], %{}, %{})
timeout = Circle.code_eval_timeout_ms(runtime.circle)
saved_child_llm = Map.get(code_state, :child_llm)

Expand All @@ -404,11 +431,13 @@ defmodule Cantrip.EntityServer do
else: next_state

obs = maybe_append_stdio(obs, captured_output)
:telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :ok})
{next_state, obs, result, terminated}

nil ->
Task.shutdown(task, :brutal_kill)
obs = [%{gate: "code", result: "code evaluation timed out", is_error: true}]
:telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :timeout})
{code_state, obs, nil, false}
end
catch
Expand All @@ -417,6 +446,8 @@ defmodule Cantrip.EntityServer do
%{gate: "code", result: "code evaluation crashed: #{inspect(reason)}", is_error: true}
]

:telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :exit, reason: reason})

{code_state, obs, nil, false}
end

Expand Down
22 changes: 22 additions & 0 deletions ex/test/conformance/fake_llm_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
defmodule Cantrip.Conformance.FakeLLMTest do
  use ExUnit.Case, async: true

  alias Cantrip.Conformance.FakeLLM

  # Every test in this module belongs to the conformance suite.
  @moduletag :conformance

  test "returns scripted responses in order" do
    initial = FakeLLM.new([%{content: "one"}, %{content: "two"}])

    # Each query must be fed the state returned by the previous one.
    assert {:ok, %{content: "one"}, after_first} = FakeLLM.query(initial, %{})
    assert {:ok, %{content: "two"}, _after_second} = FakeLLM.query(after_first, %{})
  end

  test "raises when responses exhausted" do
    empty_script = FakeLLM.new([])

    assert_raise RuntimeError, ~r/responses exhausted/, fn ->
      FakeLLM.query(empty_script, %{})
    end
  end
end
28 changes: 28 additions & 0 deletions ex/test/conformance/loader_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
defmodule Cantrip.Conformance.LoaderTest do
  use ExUnit.Case, async: true

  alias Cantrip.Conformance.Loader

  @tag :conformance
  test "loader reads exactly 71 tests with required fields" do
    tests = Loader.load()
    assert length(tests) == 71

    Enum.each(tests, fn test_case ->
      assert is_binary(test_case.id)
      assert test_case.setup != nil
      assert test_case.action != nil
      assert is_map(test_case.expect)
    end)
  end

  @tag :conformance
  test "invalid schema raises" do
    # A unique file name keeps concurrent (async) runs from clobbering each
    # other's fixture in the shared temp directory.
    file_name = "bad_cantrip_conformance_#{System.unique_integer([:positive])}.yaml"
    path = Path.join(System.tmp_dir!(), file_name)

    # Registered before the write so the fixture is removed however the test ends.
    on_exit(fn -> File.rm(path) end)

    # The second key is indented two spaces so it stays inside the same list
    # entry's mapping; the entry is valid YAML but lacks the required `rule`.
    File.write!(path, "- name: missing fields\n  setup: {}\n")

    assert_raise ArgumentError, ~r/missing rule/, fn ->
      Loader.load(path)
    end
  end
end
25 changes: 25 additions & 0 deletions ex/test/conformance_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
defmodule Cantrip.ConformanceTest do
  use ExUnit.Case

  alias Cantrip.Conformance.{Loader, Runner, TestCase}

  @moduletag :conformance

  test "runner executes yaml cases subset smoke" do
    # Smoke check: only the first loaded case is executed here.
    first_case = hd(Loader.load())

    assert :ok = Runner.run_case!(first_case)
  end

  test "runner reports failing rule" do
    # The scripted LLM answers "ok" through the done gate, while the
    # expectation demands "different", so the case must fail.
    scripted_response = %{
      "tool_calls" => [%{"gate" => "done", "args" => %{"answer" => "ok"}}]
    }

    failing_case = %TestCase{
      id: "TEST-FAIL",
      description: "failing sample",
      setup: %{
        "llm" => %{"responses" => [scripted_response]},
        "circle" => %{"gates" => ["done"], "wards" => [%{"max_turns" => 5}]}
      },
      action: %{"cast" => %{"intent" => "hello"}},
      expect: %{"result" => "different"}
    }

    assert_raise RuntimeError, ~r/conformance case TEST-FAIL/, fn ->
      Runner.run_case!(failing_case)
    end
  end
end
Loading