From 1c004593f8230eba3747c9f2e19990293d064ca1 Mon Sep 17 00:00:00 2001 From: deepfates <58602708+deepfates@users.noreply.github.com> Date: Thu, 12 Mar 2026 01:09:12 -0700 Subject: [PATCH] ex: add conformance scaffolding and runtime telemetry events --- ex/lib/cantrip/circle.ex | 15 ++++++- ex/lib/cantrip/conformance/expect.ex | 26 ++++++++++++ ex/lib/cantrip/conformance/fake_llm.ex | 23 ++++++++++ ex/lib/cantrip/conformance/loader.ex | 56 +++++++++++++++++++++++++ ex/lib/cantrip/conformance/runner.ex | 50 ++++++++++++++++++++++ ex/lib/cantrip/conformance/test_case.ex | 12 ++++++ ex/lib/cantrip/entity_server.ex | 37 ++++++++++++++-- ex/test/conformance/fake_llm_test.exs | 22 ++++++++++ ex/test/conformance/loader_test.exs | 28 +++++++++++++ ex/test/conformance_test.exs | 25 +++++++++++ ex/test/m25_telemetry_test.exs | 46 ++++++++++++++++++++ 11 files changed, 336 insertions(+), 4 deletions(-) create mode 100644 ex/lib/cantrip/conformance/expect.ex create mode 100644 ex/lib/cantrip/conformance/fake_llm.ex create mode 100644 ex/lib/cantrip/conformance/loader.ex create mode 100644 ex/lib/cantrip/conformance/runner.ex create mode 100644 ex/lib/cantrip/conformance/test_case.ex create mode 100644 ex/test/conformance/fake_llm_test.exs create mode 100644 ex/test/conformance/loader_test.exs create mode 100644 ex/test/conformance_test.exs create mode 100644 ex/test/m25_telemetry_test.exs diff --git a/ex/lib/cantrip/circle.ex b/ex/lib/cantrip/circle.ex index 3e9ca28..9abddf3 100644 --- a/ex/lib/cantrip/circle.ex +++ b/ex/lib/cantrip/circle.ex @@ -177,7 +177,20 @@ defmodule Cantrip.Circle do } def execute_gate(circle, gate_name, args) do gate_name = canonical_gate_name(gate_name) - do_execute(circle, gate_name, args) + + :telemetry.execute([:cantrip, :gate, :call], %{}, %{gate_name: gate_name, args: args}) + started_at = System.monotonic_time(:millisecond) + + result = do_execute(circle, gate_name, args) + duration = max(System.monotonic_time(:millisecond) - started_at, 0) + + :telemetry.execute( + [:cantrip, :gate, :result], + %{duration: duration}, + %{gate_name: gate_name, args: args, result: result, duration: duration} + ) + + result end @spec gate_names(t()) :: [String.t()] diff --git a/ex/lib/cantrip/conformance/expect.ex b/ex/lib/cantrip/conformance/expect.ex new file mode 100644 index 0000000..ee249aa --- /dev/null +++ b/ex/lib/cantrip/conformance/expect.ex @@ -0,0 +1,26 @@ +defmodule Cantrip.Conformance.Expect do + @moduledoc "Expectation helpers for YAML conformance checks." + + @spec check!(map(), map()) :: :ok + def check!(expect, result) do + Enum.each(expect, fn {key, expected} -> + case key do + "result" -> assert_equal!(expected, result[:result], "result") + :result -> assert_equal!(expected, result[:result], "result") + "truncated" -> assert_equal!(expected, result[:meta][:truncated], "truncated") + :truncated -> assert_equal!(expected, result[:meta][:truncated], "truncated") + "turns" -> assert_equal!(expected, result[:meta][:turns], "turn_count") + :turns -> assert_equal!(expected, result[:meta][:turns], "turn_count") + _ -> :ok + end + end) + + :ok + end + + defp assert_equal!(expected, actual, label) do + if expected != actual do + raise "expectation failed for #{label}: expected #{inspect(expected)} got #{inspect(actual)}" + end + end +end diff --git a/ex/lib/cantrip/conformance/fake_llm.ex b/ex/lib/cantrip/conformance/fake_llm.ex new file mode 100644 index 0000000..b10146d --- /dev/null +++ b/ex/lib/cantrip/conformance/fake_llm.ex @@ -0,0 +1,23 @@ +defmodule Cantrip.Conformance.FakeLLM do + @moduledoc "Deterministic scripted LLM for conformance tests." + + @behaviour Cantrip.LLM + + defstruct responses: [], index: 0 + + @type t :: %__MODULE__{responses: list(map()), index: non_neg_integer()} + + @spec new(list(map())) :: t() + def new(responses) when is_list(responses), do: %__MODULE__{responses: responses, index: 0} + + @impl true + def query(%__MODULE__{} = state, _request) do + case Enum.fetch(state.responses, state.index) do + {:ok, response} -> + {:ok, response, %{state | index: state.index + 1}} + + :error -> + raise RuntimeError, "conformance fake llm responses exhausted at index #{state.index}" + end + end +end diff --git a/ex/lib/cantrip/conformance/loader.ex b/ex/lib/cantrip/conformance/loader.ex new file mode 100644 index 0000000..0ffbc6b --- /dev/null +++ b/ex/lib/cantrip/conformance/loader.ex @@ -0,0 +1,56 @@ +defmodule Cantrip.Conformance.Loader do + @moduledoc "Load and validate conformance tests from tests.yaml." + + alias Cantrip.Conformance.TestCase + + @required_fields ~w(rule name setup action expect)a + + @spec load(Path.t()) :: [TestCase.t()] + def load(path \\ default_path()) do + path + |> read_yaml!() + |> Enum.map(&to_test_case!/1) + end + + defp read_yaml!(path) do + script = "require 'yaml'; require 'json'; puts JSON.generate(YAML.load_file(ARGV[0]))" + + case System.cmd("ruby", ["-e", script, path]) do + {json, 0} -> Jason.decode!(json) + {stderr, status} -> raise "failed to parse yaml (status #{status}): #{stderr}" + end + end + + defp default_path do + Path.expand("../../../../tests.yaml", __DIR__) + end + + defp to_test_case!(raw) when is_map(raw) do + ensure_required_fields!(raw) + + %TestCase{ + id: get_required!(raw, :rule), + description: get_required!(raw, :name), + setup: as_map!(get_required!(raw, :setup), :setup), + action: get_required!(raw, :action), + expect: as_map!(get_required!(raw, :expect), :expect) + } + end + + defp to_test_case!(_), do: raise(ArgumentError, "invalid testcase: expected map") + + defp ensure_required_fields!(raw) do + Enum.each(@required_fields, fn field -> + if Map.get(raw, field) == nil and Map.get(raw, Atom.to_string(field)) == nil do + raise ArgumentError, "invalid testcase schema: missing #{field}" + end + end) + end + + defp get_required!(map, key) do + Map.get(map, key) || Map.get(map, Atom.to_string(key)) + end + + defp as_map!(value, _field) when is_map(value), do: value + defp as_map!(value, field), do: raise(ArgumentError, "invalid testcase schema: #{field} must be map, got #{inspect(value)}") +end diff --git a/ex/lib/cantrip/conformance/runner.ex b/ex/lib/cantrip/conformance/runner.ex new file mode 100644 index 0000000..511511d --- /dev/null +++ b/ex/lib/cantrip/conformance/runner.ex @@ -0,0 +1,50 @@ +defmodule Cantrip.Conformance.Runner do + @moduledoc "Executes tests.yaml conformance cases against Elixir Cantrip runtime." + + alias Cantrip.Conformance.{Expect, FakeLLM, Loader, TestCase} + + @spec run_all!() :: :ok + def run_all! do + Loader.load() + |> Enum.each(&run_case!/1) + + :ok + end + + @spec run_case!(TestCase.t()) :: :ok + def run_case!(%TestCase{} = test_case) do + execute_case(test_case) + |> then(fn result -> Expect.check!(test_case.expect, result) end) + rescue + error -> + reraise RuntimeError, + [message: "conformance case #{test_case.id} (#{test_case.description}) failed: #{Exception.message(error)}"], + __STACKTRACE__ + end + + defp execute_case(%TestCase{setup: setup, action: action}) do + llm_responses = get_in(setup, ["llm", "responses"]) || [] + + {:ok, cantrip} = + Cantrip.new( + llm: {FakeLLM, FakeLLM.new(llm_responses)}, + circle: get_in(setup, ["circle"]) || %{"gates" => ["done"], "wards" => [%{"max_turns" => 10}]}, + identity: get_in(setup, ["identity"]) || %{} + ) + + execute_action(cantrip, action) + end + + defp execute_action(cantrip, %{"cast" => %{"intent" => intent}}) do + {:ok, result, _next_cantrip, _loom, meta} = Cantrip.cast(cantrip, intent) + %{result: result, meta: meta} + end + + defp execute_action(cantrip, action) when is_list(action) do + Enum.reduce(action, %{result: nil, meta: %{}}, fn step, _acc -> execute_action(cantrip, step) end) + end + + defp execute_action(_cantrip, _action) do + raise "unsupported action" + end +end diff --git a/ex/lib/cantrip/conformance/test_case.ex b/ex/lib/cantrip/conformance/test_case.ex new file mode 100644 index 0000000..7568bda --- /dev/null +++ b/ex/lib/cantrip/conformance/test_case.ex @@ -0,0 +1,12 @@ +defmodule Cantrip.Conformance.TestCase do + @enforce_keys [:id, :description, :setup, :action, :expect] + defstruct [:id, :description, :setup, :action, :expect] + + @type t :: %__MODULE__{ + id: String.t(), + description: String.t(), + setup: map(), + action: map() | list(map()), + expect: map() + } +end diff --git a/ex/lib/cantrip/entity_server.ex b/ex/lib/cantrip/entity_server.ex index c573363..d0aed52 100644 --- a/ex/lib/cantrip/entity_server.ex +++ b/ex/lib/cantrip/entity_server.ex @@ -51,8 +51,8 @@ defmodule Cantrip.EntityServer do stream_to = Keyword.get(opts, :stream_to) cancel_on_parent = normalize_cancel_parents(Keyword.get(opts, :cancel_on_parent)) - {:ok, - %__MODULE__{ + state = + %__MODULE__{ cantrip: cantrip, entity_id: entity_id, messages: messages, @@ -63,7 +63,16 @@ defmodule Cantrip.EntityServer do code_state: code_state, stream_to: stream_to, cancel_on_parent: cancel_on_parent - }} + } + + :telemetry.execute([:cantrip, :entity, :start], %{}, %{entity_id: state.entity_id}) + {:ok, state} + end + + @impl true + def terminate(_reason, state) do + :telemetry.execute([:cantrip, :entity, :stop], %{}, %{entity_id: state.entity_id}) + :ok end @impl true @@ -109,6 +118,10 @@ defmodule Cantrip.EntityServer do end defp run_loop(state) do + turn_number = state.turns + 1 + turn_meta = %{entity_id: state.entity_id, turn_number: turn_number, intent: nil} + :telemetry.execute([:cantrip, :turn, :start], %{}, turn_meta) + reason = truncation_reason(state) if reason do @@ -130,6 +143,7 @@ defmodule Cantrip.EntityServer do cumulative_usage: state.usage } + :telemetry.execute([:cantrip, :turn, :stop], %{duration: 0}, turn_meta) {nil, %{state | loom: loom}, meta} else emit_event(state, {:step_start, %{turn: state.turns + 1, entity_id: state.entity_id}}) @@ -173,6 +187,12 @@ defmodule Cantrip.EntityServer do cumulative_usage: state.usage } + :telemetry.execute( + [:cantrip, :turn, :stop], + %{duration: max(System.monotonic_time(:millisecond) - started_at, 1)}, + turn_meta + ) + {message, %{ state @@ -321,6 +341,12 @@ defmodule Cantrip.EntityServer do code_state: next_code_state } + :telemetry.execute( + [:cantrip, :turn, :stop], + %{duration: duration_ms}, + %{entity_id: state.entity_id, turn_number: next_state.turns, intent: nil} + ) + emit_event(state, {:step_complete, %{turn: next_state.turns, terminated: terminated}}) if terminated do @@ -380,6 +406,7 @@ defmodule Cantrip.EntityServer do end defp eval_code_sandboxed(code, code_state, runtime) do + :telemetry.execute([:cantrip, :code, :eval, :start], %{}, %{}) timeout = Circle.code_eval_timeout_ms(runtime.circle) saved_child_llm = Map.get(code_state, :child_llm) @@ -404,11 +431,13 @@ defmodule Cantrip.EntityServer do else: next_state obs = maybe_append_stdio(obs, captured_output) + :telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :ok}) {next_state, obs, result, terminated} nil -> Task.shutdown(task, :brutal_kill) obs = [%{gate: "code", result: "code evaluation timed out", is_error: true}] + :telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :timeout}) {code_state, obs, nil, false} end catch @@ -417,6 +446,8 @@ defmodule Cantrip.EntityServer do %{gate: "code", result: "code evaluation crashed: #{inspect(reason)}", is_error: true} ] + :telemetry.execute([:cantrip, :code, :eval, :stop], %{}, %{status: :exit, reason: reason}) + {code_state, obs, nil, false} end diff --git a/ex/test/conformance/fake_llm_test.exs b/ex/test/conformance/fake_llm_test.exs new file mode 100644 index 0000000..2c14efb --- /dev/null +++ b/ex/test/conformance/fake_llm_test.exs @@ -0,0 +1,22 @@ +defmodule Cantrip.Conformance.FakeLLMTest do + use ExUnit.Case, async: true + + alias Cantrip.Conformance.FakeLLM + + @tag :conformance + test "returns scripted responses in order" do + state = FakeLLM.new([%{content: "one"}, %{content: "two"}]) + + assert {:ok, %{content: "one"}, state} = FakeLLM.query(state, %{}) + assert {:ok, %{content: "two"}, _state} = FakeLLM.query(state, %{}) + end + + @tag :conformance + test "raises when responses exhausted" do + state = FakeLLM.new([]) + + assert_raise RuntimeError, ~r/responses exhausted/, fn -> + FakeLLM.query(state, %{}) + end + end +end diff --git a/ex/test/conformance/loader_test.exs b/ex/test/conformance/loader_test.exs new file mode 100644 index 0000000..aa615b8 --- /dev/null +++ b/ex/test/conformance/loader_test.exs @@ -0,0 +1,28 @@ +defmodule Cantrip.Conformance.LoaderTest do + use ExUnit.Case, async: true + + alias Cantrip.Conformance.Loader + + @tag :conformance + test "loader reads exactly 71 tests with required fields" do + tests = Loader.load() + assert length(tests) == 71 + + Enum.each(tests, fn test_case -> + assert is_binary(test_case.id) + assert test_case.setup != nil + assert test_case.action != nil + assert is_map(test_case.expect) + end) + end + + @tag :conformance + test "invalid schema raises" do + path = Path.join(System.tmp_dir!(), "bad_cantrip_conformance.yaml") + File.write!(path, "- name: missing fields\n setup: {}\n") + + assert_raise ArgumentError, ~r/missing rule/, fn -> + Loader.load(path) + end + end +end diff --git a/ex/test/conformance_test.exs b/ex/test/conformance_test.exs new file mode 100644 index 0000000..0842978 --- /dev/null +++ b/ex/test/conformance_test.exs @@ -0,0 +1,25 @@ +defmodule Cantrip.ConformanceTest do + use ExUnit.Case + + alias Cantrip.Conformance.Runner + + @tag :conformance + test "runner executes yaml cases subset smoke" do + assert :ok = Runner.run_case!(hd(Cantrip.Conformance.Loader.load())) + end + + @tag :conformance + test "runner reports failing rule" do + case = %Cantrip.Conformance.TestCase{ + id: "TEST-FAIL", + description: "failing sample", + setup: %{"llm" => %{"responses" => [%{"tool_calls" => [%{"gate" => "done", "args" => %{"answer" => "ok"}}]}]}, "circle" => %{"gates" => ["done"], "wards" => [%{"max_turns" => 5}]}}, + action: %{"cast" => %{"intent" => "hello"}}, + expect: %{"result" => "different"} + } + + assert_raise RuntimeError, ~r/conformance case TEST-FAIL/, fn -> + Runner.run_case!(case) + end + end +end diff --git a/ex/test/m25_telemetry_test.exs b/ex/test/m25_telemetry_test.exs new file mode 100644 index 0000000..7dd7e6d --- /dev/null +++ b/ex/test/m25_telemetry_test.exs @@ -0,0 +1,46 @@ +defmodule M25TelemetryTest do + use ExUnit.Case + + test "emits entity, turn and gate telemetry with metadata" do + {:ok, agent} = Agent.start_link(fn -> [] end) + handler_id = "cantrip-test-#{System.unique_integer([:positive])}" + + events = [ + [:cantrip, :entity, :start], + [:cantrip, :entity, :stop], + [:cantrip, :turn, :start], + [:cantrip, :turn, :stop], + [:cantrip, :gate, :call], + [:cantrip, :gate, :result] + ] + + :ok = + :telemetry.attach_many(handler_id, events, fn event, measurements, metadata, _ -> + Agent.update(agent, &[{event, measurements, metadata} | &1]) + end, nil) + + {:ok, cantrip} = + Cantrip.new( + llm: {Cantrip.FakeLLM, Cantrip.FakeLLM.new([%{tool_calls: [%{gate: "done", args: %{answer: "ok"}}]}])}, + circle: %{gates: ["done"], wards: [%{max_turns: 5}]} + ) + + assert {:ok, "ok", _next, _loom, _meta} = Cantrip.cast(cantrip, "hello") + + :telemetry.detach(handler_id) + + received = Agent.get(agent, &Enum.reverse/1) + Agent.stop(agent) + + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :entity, :start] end) + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :entity, :stop] end) + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :turn, :start] end) + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :turn, :stop] end) + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :gate, :call] end) + assert Enum.any?(received, fn {event, _, _} -> event == [:cantrip, :gate, :result] end) + + assert Enum.all?(received, fn {_event, _measurements, metadata} -> + if Map.has_key?(metadata, :entity_id), do: is_binary(metadata.entity_id), else: true + end) + end +end