From ae21179cad6ca49049793247dc4696f4afa923b0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 1 Mar 2026 23:55:48 +0000 Subject: [PATCH 01/26] Replace community.general.npm with ansible.builtin.command Removes the dependency on the community.general collection which was producing a warning about not supporting the installed Ansible version. Uses npm install --prefix directly instead. https://claude.ai/code/session_01Vm5EEsQ5uFKoni6qWEDQd8 --- ansible/tasks/neovim.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ansible/tasks/neovim.yml b/ansible/tasks/neovim.yml index ce22212..ab7b438 100644 --- a/ansible/tasks/neovim.yml +++ b/ansible/tasks/neovim.yml @@ -31,11 +31,14 @@ - { src: "nvim/lua", dest: ".config/nvim/lua" } - { src: "nvim/ftplugin", dest: ".config/nvim/ftplugin" } -- name: Install jsonlint node.js package. - community.general.npm: - name: jsonlint - path: "{{ venv_path }}/nvim/bin" - global: false +- name: Install jsonlint node.js package + ansible.builtin.command: + argv: + - npm + - install + - --prefix + - "{{ venv_path }}/nvim" + - jsonlint - name: Run Lazy sync for Neovim plugins ansible.builtin.command: From 9b88edd8d5b78ce720f0b693764bd6117657f1e3 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 2 Mar 2026 03:17:00 +0000 Subject: [PATCH 02/26] Add CI workflow to validate Ansible playbook on Ubuntu Runs the full playbook (minus macOS-only launch agents) on ubuntu-latest: syntax check then an actual apply. Installs Neovim, Python 3.11, and virtualenv so the neovim setup tasks (pip venv, Lazy sync, treesitter) can run too. 
https://claude.ai/code/session_01Vm5EEsQ5uFKoni6qWEDQd8 --- .github/workflows/ansible.yml | 56 +++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .github/workflows/ansible.yml diff --git a/.github/workflows/ansible.yml b/.github/workflows/ansible.yml new file mode 100644 index 0000000..06e0c14 --- /dev/null +++ b/.github/workflows/ansible.yml @@ -0,0 +1,56 @@ +name: Ansible Playbook + +on: [push] + +env: + TERM: xterm + +jobs: + playbook: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Symlink checkout to ~/.dotfiles + run: ln -s "$GITHUB_WORKSPACE" "$HOME/.dotfiles" + + - name: Create stubs for files excluded from version control + run: | + touch "$HOME/.dotfiles/.mise.toml" + mkdir -p "$HOME/.dotfiles/ssh" + touch "$HOME/.dotfiles/ssh/config" + mkdir -p "$HOME/.config/fish/completions" + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Neovim + run: | + curl -LO https://github.com/neovim/neovim/releases/latest/download/nvim-linux-x86_64.tar.gz + sudo tar -C /opt -xzf nvim-linux-x86_64.tar.gz + echo "/opt/nvim-linux-x86_64/bin" >> "$GITHUB_PATH" + + - name: Install virtualenv + run: pip install virtualenv + + - name: Check playbook syntax + run: > + uv run ansible-playbook + -i "localhost," + -c local + "$HOME/.dotfiles/ansible/dotfiles.yml" + --syntax-check + + - name: Run playbook (skip macOS-only tasks) + run: > + uv run ansible-playbook + -i "localhost," + -c local + "$HOME/.dotfiles/ansible/dotfiles.yml" + --skip-tags agents From a7f5784a7f98da5aaf90ef5b1d7f8cb9d52fc437 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 05:46:52 +0000 Subject: [PATCH 03/26] Add dbt keybindings and SQL treesitter support Add dr (dbt run), dc (dbt compile), and dt (dbt test) keybindings that format the current SQL file with sqlfmt via conform, save, then send the dbt command to 
toggleterm. Also add sql to treesitter ensure_installed for better SQL highlighting. https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 47 +++++++++++++++++++++++++++++++++++ nvim/lua/plugins/language.lua | 1 + 2 files changed, 48 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 9880f21..6ccc08a 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -57,3 +57,50 @@ vim.keymap.set("n", "wi", function() }, }) end, { noremap = true, silent = true, desc = "[W]iki [I]nsert Link" }) + +-- dbt: extract model name from current file path (e.g., models/staging/stg_orders.sql -> stg_orders) +local function dbt_model_name() + local filepath = vim.fn.expand("%:t:r") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return nil + end + return filepath +end + +-- dbt: format the current SQL file, then send a dbt command to the terminal +local function dbt_cmd(cmd_template) + local model = dbt_model_name() + if not model then + return + end + + -- Format with conform (sqlfmt), then save + require("conform").format({ async = false, lsp_fallback = true }) + vim.cmd("write") + + -- Build the command + local cmd = string.format(cmd_template, model) + + -- Send to toggleterm (terminal 1) + local term = require("toggleterm.terminal").get(1) + if not term then + term = require("toggleterm.terminal").Terminal:new({ id = 1 }) + end + if not term:is_open() then + term:toggle() + end + term:send(cmd) +end + +vim.keymap.set("n", "dr", function() + dbt_cmd("dbt run -s %s") +end, { desc = "[D]bt [R]un current model" }) + +vim.keymap.set("n", "dc", function() + dbt_cmd("dbt compile -s %s") +end, { desc = "[D]bt [C]ompile current model" }) + +vim.keymap.set("n", "dt", function() + dbt_cmd("dbt test -s %s") +end, { desc = "[D]bt [T]est current model" }) diff --git a/nvim/lua/plugins/language.lua b/nvim/lua/plugins/language.lua index 946c320..3546b2d 100644 --- 
a/nvim/lua/plugins/language.lua +++ b/nvim/lua/plugins/language.lua @@ -33,6 +33,7 @@ return { "python", "query", "regex", + "sql", "vim", "yaml", }, From ab536f3abc3325af27dae636e3d82b085eeabf3f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 06:01:16 +0000 Subject: [PATCH 04/26] Add more dbt keybindings: build, show, run+downstream - dR: dbt run -s model+ (model and all downstream dependents) - db: dbt build (run + test in DAG order) - ds: dbt show (preview query results without materializing) https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 6ccc08a..e67fee7 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -97,6 +97,14 @@ vim.keymap.set("n", "dr", function() dbt_cmd("dbt run -s %s") end, { desc = "[D]bt [R]un current model" }) +vim.keymap.set("n", "dR", function() + dbt_cmd("dbt run -s %s+") +end, { desc = "[D]bt [R]un model + downstream" }) + +vim.keymap.set("n", "db", function() + dbt_cmd("dbt build -s %s") +end, { desc = "[D]bt [B]uild current model (run + test)" }) + vim.keymap.set("n", "dc", function() dbt_cmd("dbt compile -s %s") end, { desc = "[D]bt [C]ompile current model" }) @@ -104,3 +112,7 @@ end, { desc = "[D]bt [C]ompile current model" }) vim.keymap.set("n", "dt", function() dbt_cmd("dbt test -s %s") end, { desc = "[D]bt [T]est current model" }) + +vim.keymap.set("n", "ds", function() + dbt_cmd("dbt show -s %s") +end, { desc = "[D]bt [S]how preview results" }) From b6171e346ee5d357223f9251d6b7f42bbc47836c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 20:49:44 +0000 Subject: [PATCH 05/26] Add dbt navigation, fuzzy model picker, and model search - dg: jump to ref() or source() under cursor - df: fzf model picker with ctrl-r/b/t to run/build/test - do: open compiled SQL in readonly vsplit - d/: grep across all models (find columns, 
CTEs, etc.) Inspired by fzf-dbt CLI tool, adapted for neovim with fzf-lua. https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index e67fee7..b712a3e 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -93,6 +93,166 @@ local function dbt_cmd(cmd_template) term:send(cmd) end +-- dbt: find the project root (directory containing dbt_project.yml) +local function dbt_project_root() + local path = vim.fn.findfile("dbt_project.yml", ".;") + if path == "" then + return nil + end + return vim.fn.fnamemodify(path, ":p:h") +end + +-- dbt: jump to model under cursor from {{ ref('model_name') }} or {{ source('src', 'table') }} +vim.keymap.set("n", "dg", function() + local line = vim.api.nvim_get_current_line() + local col = vim.api.nvim_win_get_cursor(0)[2] + 1 + + -- Try to find ref('model') or ref("model") around cursor + local ref_model = nil + for start_pos, name, end_pos in line:gmatch("()ref%(['\"]([^'\"]+)['\"]%)()" ) do + if col >= start_pos and col <= end_pos then + ref_model = name + break + end + end + + -- Try source('source_name', 'table_name') if no ref found + local source_name, source_table = nil, nil + if not ref_model then + for start_pos, src, tbl, end_pos in line:gmatch("()source%(['\"]([^'\"]+)['\"]%s*,%s*['\"]([^'\"]+)['\"]%)()" ) do + if col >= start_pos and col <= end_pos then + source_name, source_table = src, tbl + break + end + end + end + + if not ref_model and not source_table then + vim.notify("No ref() or source() under cursor", vim.log.levels.WARN) + return + end + + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + -- Search for the model file + local search_name = ref_model or source_table + local matches = vim.fn.globpath(root, "**/" .. search_name .. 
".sql", false, true) + if #matches == 0 then + -- Also try .yml for source definitions + matches = vim.fn.globpath(root, "**/" .. search_name .. ".yml", false, true) + end + + if #matches == 1 then + vim.cmd.edit(matches[1]) + elseif #matches > 1 then + vim.ui.select(matches, { prompt = "Multiple matches:" }, function(choice) + if choice then + vim.cmd.edit(choice) + end + end) + else + vim.notify("No file found for: " .. search_name, vim.log.levels.WARN) + end +end, { desc = "[D]bt [G]o to ref/source" }) + +-- dbt: send a raw command string to toggleterm (used by fzf actions) +local function dbt_cmd_raw(cmd) + local term = require("toggleterm.terminal").get(1) + if not term then + term = require("toggleterm.terminal").Terminal:new({ id = 1 }) + end + if not term:is_open() then + term:toggle() + end + term:send(cmd) +end + +-- dbt: fuzzy model picker — select a model then choose an action +vim.keymap.set("n", "df", function() + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + local fzf = require("fzf-lua") + fzf.files({ + prompt = "dbt model ", + cwd = root, + cmd = "fd -e sql . models", + actions = { + -- Default: open the file + ["default"] = fzf.actions.file_edit, + -- Ctrl-r: run the selected model + ["ctrl-r"] = function(selected) + if selected and #selected > 0 then + local name = selected[1]:match("([^/]+)%.sql$") + if name then + dbt_cmd_raw("dbt run -s " .. name) + end + end + end, + -- Ctrl-b: build the selected model + ["ctrl-b"] = function(selected) + if selected and #selected > 0 then + local name = selected[1]:match("([^/]+)%.sql$") + if name then + dbt_cmd_raw("dbt build -s " .. name) + end + end + end, + -- Ctrl-t: test the selected model + ["ctrl-t"] = function(selected) + if selected and #selected > 0 then + local name = selected[1]:match("([^/]+)%.sql$") + if name then + dbt_cmd_raw("dbt test -s " .. 
name) + end + end + end, + }, + fzf_opts = { + ["--header"] = "enter=open | ctrl-r=run | ctrl-b=build | ctrl-t=test", + ["--multi"] = true, + }, + }) +end, { desc = "[D]bt [F]ind model (fzf)" }) + +-- dbt: open the compiled SQL for the current model in a split +vim.keymap.set("n", "do", function() + local model = dbt_model_name() + if not model then + return + end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + local compiled = vim.fn.globpath(root, "target/compiled/**/" .. model .. ".sql", false, true) + if #compiled == 0 then + vim.notify("No compiled SQL found — run dbt compile first", vim.log.levels.WARN) + return + end + vim.cmd("vsplit " .. compiled[1]) + vim.bo.readonly = true + vim.bo.modifiable = false +end, { desc = "[D]bt [O]pen compiled SQL" }) + +-- dbt: grep across all models (search for column names, CTEs, etc.) +vim.keymap.set("n", "d/", function() + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + require("fzf-lua").grep({ prompt = "dbt grep ", cwd = root .. "/models" }) +end, { desc = "[D]bt search models" }) + vim.keymap.set("n", "dr", function() dbt_cmd("dbt run -s %s") end, { desc = "[D]bt [R]un current model" }) From 93d5ca47ebdb3b3cb61885062059e7c6ae14acf2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:33:26 +0000 Subject: [PATCH 06/26] Add claude agent keymaps for dbt model analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit da: quick analysis with sonnet — reviews the current model and suggests improvements in non-interactive mode. dA: deep analysis with sonnet thinking — interrogates the duckdb database and cross-references with the current model to check data quality, joins, types, and more. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index b712a3e..e7bede1 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -276,3 +276,35 @@ end, { desc = "[D]bt [T]est current model" }) vim.keymap.set("n", "ds", function() dbt_cmd("dbt show -s %s") end, { desc = "[D]bt [S]how preview results" }) + +-- dbt: run claude agent on current model (quick analysis with sonnet) +vim.keymap.set("n", "da", function() + local filepath = vim.fn.expand("%:p") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return + end + local cmd = string.format( + 'claude -p "Review this dbt model and add brief comments suggesting improvements, potential issues, or best-practice violations. Be concise." --model claude-sonnet-4-6 %s', + vim.fn.shellescape(filepath) + ) + dbt_cmd_raw(cmd) +end, { desc = "[D]bt [A]nalyse model (quick)" }) + +-- dbt: run claude agent on current model (deep analysis with sonnet thinking, cross-reference DB) +vim.keymap.set("n", "dA", function() + local filepath = vim.fn.expand("%:p") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return + end + local model = dbt_model_name() + if not model then + return + end + local cmd = string.format( + 'claude -p "You have access to a duckdb database. Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. Run queries to validate assumptions." 
--model claude-sonnet-4-6 --thinking %s', + vim.fn.shellescape(filepath) + ) + dbt_cmd_raw(cmd) +end, { desc = "[D]bt [A]nalyse model (deep, DB cross-ref)" }) From 45dcc020b5b80bce61e9f005714ec5ab9ce4bcc5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:36:25 +0000 Subject: [PATCH 07/26] Include compiled SQL and sample rows in deep dbt analysis The dA keymap now compiles the model first, then passes the compiled SQL and first 20 rows from dbt show as extra context to the claude agent for more informed analysis. https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index e7bede1..8bffc11 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -292,6 +292,7 @@ vim.keymap.set("n", "da", function() end, { desc = "[D]bt [A]nalyse model (quick)" }) -- dbt: run claude agent on current model (deep analysis with sonnet thinking, cross-reference DB) +-- Compiles the model first, then passes compiled SQL + first 20 rows as extra context vim.keymap.set("n", "dA", function() local filepath = vim.fn.expand("%:p") if filepath == "" then @@ -302,8 +303,28 @@ vim.keymap.set("n", "dA", function() if not model then return end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + -- Build a shell script that: + -- 1. Compiles the model and captures compiled SQL + -- 2. Runs dbt show --limit 20 to get sample rows + -- 3. Feeds everything to claude local cmd = string.format( - 'claude -p "You have access to a duckdb database. Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. Run queries to validate assumptions." 
--model claude-sonnet-4-6 --thinking %s', + [[dbt compile -s %s --quiet && compiled_sql=$(cat $(find %s/target/compiled -name '%s.sql' | head -1) 2>/dev/null) && sample_rows=$(dbt show -s %s --limit 20 2>/dev/null) && claude -p "You have access to a duckdb database. Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. Run queries to validate assumptions. + +Here is the compiled SQL: +${compiled_sql} + +Here are the first 20 rows returned by this model: +${sample_rows}" --model claude-sonnet-4-6 --thinking %s]], + model, + vim.fn.shellescape(root), + model, + model, vim.fn.shellescape(filepath) ) dbt_cmd_raw(cmd) From 70af540b05fb2a3d07fde78dca38680611251fa8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:37:27 +0000 Subject: [PATCH 08/26] Move dbt agent prompts to separate markdown template files Prompts now live in nvim/prompts/ as markdown files with {{var}} template placeholders. The quick analysis prompt is loaded and substituted in Lua; the deep analysis prompt uses sed at runtime to inject compiled SQL and sample rows before passing to claude. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 42 +++++++++++++++++++++--------- nvim/prompts/dbt_deep_analysis.md | 7 +++++ nvim/prompts/dbt_quick_analysis.md | 1 + 3 files changed, 38 insertions(+), 12 deletions(-) create mode 100644 nvim/prompts/dbt_deep_analysis.md create mode 100644 nvim/prompts/dbt_quick_analysis.md diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 8bffc11..6d12068 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -277,6 +277,22 @@ vim.keymap.set("n", "ds", function() dbt_cmd("dbt show -s %s") end, { desc = "[D]bt [S]how preview results" }) +-- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders +local function dbt_load_prompt(name, vars) + local prompt_dir = vim.fn.stdpath("config") .. "/prompts/" + local path = prompt_dir .. name .. ".md" + local lines = vim.fn.readfile(path) + if #lines == 0 then + vim.notify("Prompt not found: " .. path, vim.log.levels.ERROR) + return nil + end + local prompt = table.concat(lines, "\n") + for key, value in pairs(vars or {}) do + prompt = prompt:gsub("{{" .. key .. "}}", value) + end + return prompt +end + -- dbt: run claude agent on current model (quick analysis with sonnet) vim.keymap.set("n", "da", function() local filepath = vim.fn.expand("%:p") @@ -284,8 +300,13 @@ vim.keymap.set("n", "da", function() vim.notify("No file open", vim.log.levels.WARN) return end + local prompt = dbt_load_prompt("dbt_quick_analysis", {}) + if not prompt then + return + end local cmd = string.format( - 'claude -p "Review this dbt model and add brief comments suggesting improvements, potential issues, or best-practice violations. Be concise." 
--model claude-sonnet-4-6 %s', + "claude -p %s --model claude-sonnet-4-6 %s", + vim.fn.shellescape(prompt), vim.fn.shellescape(filepath) ) dbt_cmd_raw(cmd) @@ -309,22 +330,19 @@ vim.keymap.set("n", "dA", function() return end - -- Build a shell script that: - -- 1. Compiles the model and captures compiled SQL - -- 2. Runs dbt show --limit 20 to get sample rows - -- 3. Feeds everything to claude + -- Compile + gather sample rows, then template the prompt and pass to claude + local prompt_path = vim.fn.stdpath("config") .. "/prompts/dbt_deep_analysis.md" local cmd = string.format( - [[dbt compile -s %s --quiet && compiled_sql=$(cat $(find %s/target/compiled -name '%s.sql' | head -1) 2>/dev/null) && sample_rows=$(dbt show -s %s --limit 20 2>/dev/null) && claude -p "You have access to a duckdb database. Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. Run queries to validate assumptions. - -Here is the compiled SQL: -${compiled_sql} - -Here are the first 20 rows returned by this model: -${sample_rows}" --model claude-sonnet-4-6 --thinking %s]], + [[dbt compile -s %s --quiet ]] + .. [[&& compiled_sql=$(cat $(find %s/target/compiled -name '%s.sql' | head -1) 2>/dev/null) ]] + .. [[&& sample_rows=$(dbt show -s %s --limit 20 2>/dev/null) ]] + .. [[&& prompt=$(sed -e "s|{{compiled_sql}}|${compiled_sql}|g" -e "s|{{sample_rows}}|${sample_rows}|g" %s) ]] + .. [[&& claude -p "${prompt}" --model claude-sonnet-4-6 --thinking %s]], model, vim.fn.shellescape(root), model, model, + vim.fn.shellescape(prompt_path), vim.fn.shellescape(filepath) ) dbt_cmd_raw(cmd) diff --git a/nvim/prompts/dbt_deep_analysis.md b/nvim/prompts/dbt_deep_analysis.md new file mode 100644 index 0000000..397ca3d --- /dev/null +++ b/nvim/prompts/dbt_deep_analysis.md @@ -0,0 +1,7 @@ +You have access to a duckdb database. 
Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. Run queries to validate assumptions. + +Here is the compiled SQL: +{{compiled_sql}} + +Here are the first 20 rows returned by this model: +{{sample_rows}} diff --git a/nvim/prompts/dbt_quick_analysis.md b/nvim/prompts/dbt_quick_analysis.md new file mode 100644 index 0000000..ee1eeea --- /dev/null +++ b/nvim/prompts/dbt_quick_analysis.md @@ -0,0 +1 @@ +Review this dbt model and add brief comments suggesting improvements, potential issues, or best-practice violations. Be concise. From 25dc51ae30c0d19f0eb7a1bef7670ccbec55fc95 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:41:28 +0000 Subject: [PATCH 09/26] Write inline comments to buffer and prefix all dbt commands with uv run - da now replaces the buffer with the annotated SQL (undo with :u) - Updated prompt to allow brief explanations alongside suggestions - All dbt commands (run, build, test, compile, show) now use uv run https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 61 +++++++++++++++++++++--------- nvim/prompts/dbt_quick_analysis.md | 5 ++- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 6d12068..78cf646 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -192,7 +192,7 @@ vim.keymap.set("n", "df", function() if selected and #selected > 0 then local name = selected[1]:match("([^/]+)%.sql$") if name then - dbt_cmd_raw("dbt run -s " .. name) + dbt_cmd_raw("uv run dbt run -s " .. name) end end end, @@ -201,7 +201,7 @@ vim.keymap.set("n", "df", function() if selected and #selected > 0 then local name = selected[1]:match("([^/]+)%.sql$") if name then - dbt_cmd_raw("dbt build -s " .. name) + dbt_cmd_raw("uv run dbt build -s " .. 
name) end end end, @@ -210,7 +210,7 @@ vim.keymap.set("n", "df", function() if selected and #selected > 0 then local name = selected[1]:match("([^/]+)%.sql$") if name then - dbt_cmd_raw("dbt test -s " .. name) + dbt_cmd_raw("uv run dbt test -s " .. name) end end end, @@ -254,27 +254,27 @@ vim.keymap.set("n", "d/", function() end, { desc = "[D]bt search models" }) vim.keymap.set("n", "dr", function() - dbt_cmd("dbt run -s %s") + dbt_cmd("uv run dbt run -s %s") end, { desc = "[D]bt [R]un current model" }) vim.keymap.set("n", "dR", function() - dbt_cmd("dbt run -s %s+") + dbt_cmd("uv run dbt run -s %s+") end, { desc = "[D]bt [R]un model + downstream" }) vim.keymap.set("n", "db", function() - dbt_cmd("dbt build -s %s") + dbt_cmd("uv run dbt build -s %s") end, { desc = "[D]bt [B]uild current model (run + test)" }) vim.keymap.set("n", "dc", function() - dbt_cmd("dbt compile -s %s") + dbt_cmd("uv run dbt compile -s %s") end, { desc = "[D]bt [C]ompile current model" }) vim.keymap.set("n", "dt", function() - dbt_cmd("dbt test -s %s") + dbt_cmd("uv run dbt test -s %s") end, { desc = "[D]bt [T]est current model" }) vim.keymap.set("n", "ds", function() - dbt_cmd("dbt show -s %s") + dbt_cmd("uv run dbt show -s %s") end, { desc = "[D]bt [S]how preview results" }) -- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders @@ -294,6 +294,7 @@ local function dbt_load_prompt(name, vars) end -- dbt: run claude agent on current model (quick analysis with sonnet) +-- Replaces buffer contents with the file + inline SQL comments vim.keymap.set("n", "da", function() local filepath = vim.fn.expand("%:p") if filepath == "" then @@ -304,12 +305,38 @@ vim.keymap.set("n", "da", function() if not prompt then return end - local cmd = string.format( - "claude -p %s --model claude-sonnet-4-6 %s", - vim.fn.shellescape(prompt), - vim.fn.shellescape(filepath) - ) - dbt_cmd_raw(cmd) + local bufnr = vim.api.nvim_get_current_buf() + + vim.notify("Running quick analysis...", 
vim.log.levels.INFO) + + local cmd = { "claude", "-p", prompt, "--model", "claude-sonnet-4-6", filepath } + local output = {} + vim.fn.jobstart(cmd, { + stdout_buffered = true, + on_stdout = function(_, data) + if data then + output = data + end + end, + on_exit = function(_, exit_code) + vim.schedule(function() + if exit_code ~= 0 then + vim.notify("claude exited with code " .. exit_code, vim.log.levels.ERROR) + return + end + -- Remove trailing empty strings from jobstart output + while #output > 0 and output[#output] == "" do + table.remove(output) + end + if #output == 0 then + vim.notify("No output from claude", vim.log.levels.WARN) + return + end + vim.api.nvim_buf_set_lines(bufnr, 0, -1, false, output) + vim.notify("Inline comments added — review and :w to save, or :u to undo", vim.log.levels.INFO) + end) + end, + }) end, { desc = "[D]bt [A]nalyse model (quick)" }) -- dbt: run claude agent on current model (deep analysis with sonnet thinking, cross-reference DB) @@ -333,9 +360,9 @@ vim.keymap.set("n", "dA", function() -- Compile + gather sample rows, then template the prompt and pass to claude local prompt_path = vim.fn.stdpath("config") .. "/prompts/dbt_deep_analysis.md" local cmd = string.format( - [[dbt compile -s %s --quiet ]] + [[uv run dbt compile -s %s --quiet ]] .. [[&& compiled_sql=$(cat $(find %s/target/compiled -name '%s.sql' | head -1) 2>/dev/null) ]] - .. [[&& sample_rows=$(dbt show -s %s --limit 20 2>/dev/null) ]] + .. [[&& sample_rows=$(uv run dbt show -s %s --limit 20 2>/dev/null) ]] .. [[&& prompt=$(sed -e "s|{{compiled_sql}}|${compiled_sql}|g" -e "s|{{sample_rows}}|${sample_rows}|g" %s) ]] .. 
[[&& claude -p "${prompt}" --model claude-sonnet-4-6 --thinking %s]], model, diff --git a/nvim/prompts/dbt_quick_analysis.md b/nvim/prompts/dbt_quick_analysis.md index ee1eeea..6eb4511 100644 --- a/nvim/prompts/dbt_quick_analysis.md +++ b/nvim/prompts/dbt_quick_analysis.md @@ -1 +1,4 @@ -Review this dbt model and add brief comments suggesting improvements, potential issues, or best-practice violations. Be concise. +Output the complete SQL file with inline comments added as SQL comments (-- ). +Add brief comments suggesting improvements, potential issues, or best-practice violations. +Where appropriate, include a short explanation of why the suggestion matters. +Output ONLY the SQL with comments, no markdown fences, no preamble. From c7d612f7a026269913b11377465060f4ef9ea419 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:42:12 +0000 Subject: [PATCH 10/26] Open deep dbt analysis in interactive tmux window dA now opens a new tmux window named 'dbt:' that compiles the model, gathers sample rows, then starts an interactive claude session with that context so you can discuss the model. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 78cf646..28a9ac2 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -339,8 +339,7 @@ vim.keymap.set("n", "da", function() }) end, { desc = "[D]bt [A]nalyse model (quick)" }) --- dbt: run claude agent on current model (deep analysis with sonnet thinking, cross-reference DB) --- Compiles the model first, then passes compiled SQL + first 20 rows as extra context +-- dbt: open interactive claude session in a new tmux window with compiled SQL + sample rows as context vim.keymap.set("n", "dA", function() local filepath = vim.fn.expand("%:p") if filepath == "" then @@ -357,14 +356,17 @@ vim.keymap.set("n", "dA", function() return end - -- Compile + gather sample rows, then template the prompt and pass to claude + -- Open a new tmux window that compiles, gathers context, then starts interactive claude local prompt_path = vim.fn.stdpath("config") .. "/prompts/dbt_deep_analysis.md" local cmd = string.format( - [[uv run dbt compile -s %s --quiet ]] - .. [[&& compiled_sql=$(cat $(find %s/target/compiled -name '%s.sql' | head -1) 2>/dev/null) ]] + [[tmux new-window -n 'dbt:%s' ']] + .. [[uv run dbt compile -s %s --quiet ]] + .. [[&& compiled_sql=$(cat $(find %s/target/compiled -name "%s.sql" | head -1) 2>/dev/null) ]] .. [[&& sample_rows=$(uv run dbt show -s %s --limit 20 2>/dev/null) ]] .. [[&& prompt=$(sed -e "s|{{compiled_sql}}|${compiled_sql}|g" -e "s|{{sample_rows}}|${sample_rows}|g" %s) ]] - .. [[&& claude -p "${prompt}" --model claude-sonnet-4-6 --thinking %s]], + .. [[&& claude --model claude-sonnet-4-6 --thinking --prompt "${prompt}" %s ]] + .. 
[[|| read -p "Press enter to close..."']], + model, model, vim.fn.shellescape(root), model, @@ -372,5 +374,6 @@ vim.keymap.set("n", "dA", function() vim.fn.shellescape(prompt_path), vim.fn.shellescape(filepath) ) - dbt_cmd_raw(cmd) -end, { desc = "[D]bt [A]nalyse model (deep, DB cross-ref)" }) + vim.fn.system(cmd) + vim.notify("Opened interactive claude session in tmux window 'dbt:" .. model .. "'", vim.log.levels.INFO) +end, { desc = "[D]bt [A]nalyse model (interactive)" }) From 2038e17ac1f3516323dc1ea5e6c163b5f60eb74f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:51:55 +0000 Subject: [PATCH 11/26] Add dp to preview dbt model rows in a split Runs dbt show --limit 20 asynchronously and displays the results in a read-only scratch buffer. Press q to close. https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 28a9ac2..f01ae73 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -277,6 +277,57 @@ vim.keymap.set("n", "ds", function() dbt_cmd("uv run dbt show -s %s") end, { desc = "[D]bt [S]how preview results" }) +-- dbt: preview sample rows in a horizontal split +vim.keymap.set("n", "dp", function() + local model = dbt_model_name() + if not model then + return + end + vim.notify("Fetching preview for " .. model .. "...", vim.log.levels.INFO) + local cmd = { "uv", "run", "dbt", "show", "-s", model, "--limit", "20" } + local output = {} + vim.fn.jobstart(cmd, { + cwd = dbt_project_root(), + stdout_buffered = true, + stderr_buffered = true, + on_stdout = function(_, data) + if data then + vim.list_extend(output, data) + end + end, + on_exit = function(_, exit_code) + vim.schedule(function() + if exit_code ~= 0 then + vim.notify("dbt show failed (exit " .. exit_code .. 
")", vim.log.levels.ERROR) + return + end + -- Trim trailing empty lines + while #output > 0 and output[#output] == "" do + table.remove(output) + end + if #output == 0 then + vim.notify("No rows returned", vim.log.levels.WARN) + return + end + -- Open a scratch buffer in a horizontal split + vim.cmd("botright new") + local buf = vim.api.nvim_get_current_buf() + vim.bo[buf].buftype = "nofile" + vim.bo[buf].bufhidden = "wipe" + vim.bo[buf].swapfile = false + vim.bo[buf].filetype = "sql" + vim.api.nvim_buf_set_name(buf, "dbt-preview://" .. model) + vim.api.nvim_buf_set_lines(buf, 0, -1, false, output) + vim.bo[buf].modifiable = false + -- Resize to fit content (max 20 lines) + local height = math.min(#output, 20) + vim.api.nvim_win_set_height(0, height) + vim.keymap.set("n", "q", "close", { buffer = buf, silent = true }) + end) + end, + }) +end, { desc = "[D]bt [P]review model rows" }) + -- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders local function dbt_load_prompt(name, vars) local prompt_dir = vim.fn.stdpath("config") .. 
"/prompts/" From 0885d1559ffa8fc11500f2feee532674eb50190e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 21:54:09 +0000 Subject: [PATCH 12/26] Return cursor to code buffer after dbt_cmd sends to toggleterm https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index f01ae73..84e005e 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -82,7 +82,8 @@ local function dbt_cmd(cmd_template) -- Build the command local cmd = string.format(cmd_template, model) - -- Send to toggleterm (terminal 1) + -- Send to toggleterm (terminal 1), then return focus to the code window + local prev_win = vim.api.nvim_get_current_win() local term = require("toggleterm.terminal").get(1) if not term then term = require("toggleterm.terminal").Terminal:new({ id = 1 }) @@ -91,6 +92,7 @@ local function dbt_cmd(cmd_template) term:toggle() end term:send(cmd) + vim.api.nvim_set_current_win(prev_win) end -- dbt: find the project root (directory containing dbt_project.yml) From ea2df0d180322445d7c2f9267758cec830377cfa Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 22:00:24 +0000 Subject: [PATCH 13/26] Replace fzf-lua with telescope.builtin across all keymaps fzf-lua is not installed; the config uses telescope via LazyVim. Converts wiki search, wiki insert link, dbt model finder (with C-r/C-b/C-t actions), and dbt grep to telescope equivalents. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 107 ++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 61 deletions(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 84e005e..04a9ea6 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -17,8 +17,7 @@ end, { desc = "[I]nsert [D]ate" }) -- Shortcut for searching the wiki vim.keymap.set("n", "ws", function() - local fzf = require("fzf-lua") - fzf.files({ prompt = "wiki  ", cwd = vim.g.wiki_root }) + require("telescope.builtin").find_files({ prompt_title = "Wiki", cwd = vim.g.wiki_root }) end, { desc = "[W]iki [S]earch" }) -- Create a new page in the wiki @@ -33,28 +32,23 @@ vim.keymap.set("n", "wn", function() end, { desc = "[W]iki [N]ew Page" }) vim.keymap.set("n", "wi", function() - local fzf = require("fzf-lua") - fzf.files({ - prompt = "Wiki Files", + local actions = require("telescope.actions") + local action_state = require("telescope.actions.state") + require("telescope.builtin").find_files({ + prompt_title = "Wiki Insert Link", cwd = vim.g.wiki_root, - actions = { - -- Override the default selection action - ["default"] = function(selected) - -- Get the selected file (should be just one) - if selected and #selected > 0 then - local full_path = selected[1] - - -- Extract just the filename from the path - local filename = full_path:match("([^/\\]+)$") - - -- Create a wiki link format [[filename]] without the file extension + attach_mappings = function(prompt_bufnr, map) + actions.select_default:replace(function() + local entry = action_state.get_selected_entry() + actions.close(prompt_bufnr) + if entry then + local filename = entry[1]:match("([^/\\]+)$") local link = string.format("[[%s]]", filename:gsub("%.%w+$", "")) - - -- Insert the wiki link at the current cursor position vim.api.nvim_put({ link }, "c", true, true) end - end, - }, + end) + return true + end, }) end, { noremap = true, silent 
= true, desc = "[W]iki [I]nsert Link" }) @@ -161,7 +155,7 @@ vim.keymap.set("n", "dg", function() end end, { desc = "[D]bt [G]o to ref/source" }) --- dbt: send a raw command string to toggleterm (used by fzf actions) +-- dbt: send a raw command string to toggleterm (used by picker actions) local function dbt_cmd_raw(cmd) local term = require("toggleterm.terminal").get(1) if not term then @@ -181,48 +175,39 @@ vim.keymap.set("n", "df", function() return end - local fzf = require("fzf-lua") - fzf.files({ - prompt = "dbt model ", + local actions = require("telescope.actions") + local action_state = require("telescope.actions.state") + require("telescope.builtin").find_files({ + prompt_title = "dbt model (enter=open, C-r=run, C-b=build, C-t=test)", cwd = root, - cmd = "fd -e sql . models", - actions = { - -- Default: open the file - ["default"] = fzf.actions.file_edit, - -- Ctrl-r: run the selected model - ["ctrl-r"] = function(selected) - if selected and #selected > 0 then - local name = selected[1]:match("([^/]+)%.sql$") - if name then - dbt_cmd_raw("uv run dbt run -s " .. name) - end - end - end, - -- Ctrl-b: build the selected model - ["ctrl-b"] = function(selected) - if selected and #selected > 0 then - local name = selected[1]:match("([^/]+)%.sql$") - if name then - dbt_cmd_raw("uv run dbt build -s " .. name) - end + search_dirs = { "models" }, + find_command = { "fd", "-e", "sql" }, + attach_mappings = function(prompt_bufnr, map) + local function get_model_name() + local entry = action_state.get_selected_entry() + if entry then + return entry[1]:match("([^/]+)%.sql$") end - end, - -- Ctrl-t: test the selected model - ["ctrl-t"] = function(selected) - if selected and #selected > 0 then - local name = selected[1]:match("([^/]+)%.sql$") - if name then - dbt_cmd_raw("uv run dbt test -s " .. 
name) - end - end - end, - }, - fzf_opts = { - ["--header"] = "enter=open | ctrl-r=run | ctrl-b=build | ctrl-t=test", - ["--multi"] = true, - }, + end + map("i", "", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt run -s " .. name) end + end) + map("i", "", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt build -s " .. name) end + end) + map("i", "", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt test -s " .. name) end + end) + return true + end, }) -end, { desc = "[D]bt [F]ind model (fzf)" }) +end, { desc = "[D]bt [F]ind model" }) -- dbt: open the compiled SQL for the current model in a split vim.keymap.set("n", "do", function() @@ -252,7 +237,7 @@ vim.keymap.set("n", "d/", function() vim.notify("No dbt_project.yml found", vim.log.levels.WARN) return end - require("fzf-lua").grep({ prompt = "dbt grep ", cwd = root .. "/models" }) + require("telescope.builtin").live_grep({ prompt_title = "dbt grep", cwd = root .. "/models" }) end, { desc = "[D]bt search models" }) vim.keymap.set("n", "dr", function() From 35131c3e72bec2770da7acf4312154b25b7bcd9f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 22:09:47 +0000 Subject: [PATCH 14/26] Add dv to show dbt model output in visidata Pipes `dbt show --output csv --limit 500` into visidata via toggleterm. Formats and saves the file first like other dbt commands. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 04a9ea6..353c204 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -315,6 +315,22 @@ vim.keymap.set("n", "dp", function() }) end, { desc = "[D]bt [P]review model rows" }) +-- dbt: show model output as CSV piped into visidata +vim.keymap.set("n", "dv", function() + local model = dbt_model_name() + if not model then + return + end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + require("conform").format({ async = false, lsp_fallback = true }) + vim.cmd("write") + dbt_cmd_raw("cd " .. root .. " && uv run dbt show -s " .. model .. " --limit 500 --output csv | vd -f csv") +end, { desc = "[D]bt [V]isidata preview" }) + -- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders local function dbt_load_prompt(name, vars) local prompt_dir = vim.fn.stdpath("config") .. "/prompts/" From 4983db09df21156ae0e9ca673e15ea254e02105c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 22:12:31 +0000 Subject: [PATCH 15/26] Return focus to code buffer after visidata exits Uses a dedicated toggleterm Terminal with close_on_exit and an on_exit callback instead of the shared terminal, so focus returns to the previous window when visidata is quit. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 353c204..049650f 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -328,7 +328,21 @@ vim.keymap.set("n", "dv", function() end require("conform").format({ async = false, lsp_fallback = true }) vim.cmd("write") - dbt_cmd_raw("cd " .. root .. " && uv run dbt show -s " .. model .. " --limit 500 --output csv | vd -f csv") + local prev_win = vim.api.nvim_get_current_win() + local cmd = "cd " .. root .. " && uv run dbt show -s " .. model .. " --limit 500 --output csv | vd -f csv" + require("toggleterm.terminal").Terminal + :new({ + cmd = cmd, + close_on_exit = true, + on_exit = function() + vim.schedule(function() + if vim.api.nvim_win_is_valid(prev_win) then + vim.api.nvim_set_current_win(prev_win) + end + end) + end, + }) + :toggle() end, { desc = "[D]bt [V]isidata preview" }) -- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders From ed173af977682d4b3034149aa9eb158c2cc64aa6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 22:20:22 +0000 Subject: [PATCH 16/26] Fix dbt visidata: use json output with json-to-csv conversion dbt show only supports --output json/text, not csv. Use --output json --log-format json and pipe through a python script that extracts the preview data and converts to CSV. 
https://claude.ai/code/session_016johXLfEd6P4umaT14YQEQ --- nvim/lua/config/keymaps.lua | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 049650f..a4252ec 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -329,7 +329,30 @@ vim.keymap.set("n", "dv", function() require("conform").format({ async = false, lsp_fallback = true }) vim.cmd("write") local prev_win = vim.api.nvim_get_current_win() - local cmd = "cd " .. root .. " && uv run dbt show -s " .. model .. " --limit 500 --output csv | vd -f csv" + local json_to_csv = [[python3 -c " +import sys, json, csv +raw = sys.stdin.read() +for line in raw.splitlines(): + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + preview = None + if 'data' in obj and 'preview' in obj['data']: + preview = obj['data']['preview'] + elif 'results' in obj: + preview = obj['results'][0].get('preview') + elif 'preview' in obj: + preview = obj['preview'] + if preview: + if isinstance(preview, str): + preview = json.loads(preview) + w = csv.DictWriter(sys.stdout, fieldnames=preview[0].keys()) + w.writeheader() + w.writerows(preview) + break +"]] + local cmd = "cd " .. root .. " && uv run dbt show -s " .. model .. " --limit 500 --output json --log-format json | " .. json_to_csv .. 
" | vd -f csv" require("toggleterm.terminal").Terminal :new({ cmd = cmd, From b4fe1b83fb06035fd9ba49660ac1bd607df5d4d7 Mon Sep 17 00:00:00 2001 From: Michael Barton Date: Fri, 6 Mar 2026 13:47:39 -0800 Subject: [PATCH 17/26] Rename dbt prompt files --- {nvim/prompts => dbt}/dbt_deep_analysis.md | 0 {nvim/prompts => dbt}/dbt_quick_analysis.md | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {nvim/prompts => dbt}/dbt_deep_analysis.md (100%) rename {nvim/prompts => dbt}/dbt_quick_analysis.md (100%) diff --git a/nvim/prompts/dbt_deep_analysis.md b/dbt/dbt_deep_analysis.md similarity index 100% rename from nvim/prompts/dbt_deep_analysis.md rename to dbt/dbt_deep_analysis.md diff --git a/nvim/prompts/dbt_quick_analysis.md b/dbt/dbt_quick_analysis.md similarity index 100% rename from nvim/prompts/dbt_quick_analysis.md rename to dbt/dbt_quick_analysis.md From 054a85593f6ac9affeb12274ee319310d04c032b Mon Sep 17 00:00:00 2001 From: Michael Barton Date: Fri, 6 Mar 2026 13:48:11 -0800 Subject: [PATCH 18/26] Add explicit python scripts for db analsis --- ansible/tasks/neovim.yml | 9 +- dbt/dbt_analyse.py | 109 +++++++++++ dbt/dbt_batch_audit.py | 398 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 513 insertions(+), 3 deletions(-) create mode 100755 dbt/dbt_analyse.py create mode 100644 dbt/dbt_batch_audit.py diff --git a/ansible/tasks/neovim.yml b/ansible/tasks/neovim.yml index ce22212..6e37719 100644 --- a/ansible/tasks/neovim.yml +++ b/ansible/tasks/neovim.yml @@ -15,13 +15,15 @@ - ruff - yamllint -- name: Remove existing ftplugin directory if it exists (to allow symlinking) +- name: Remove existing directories that need to be replaced with symlinks ansible.builtin.file: - path: "{{ ansible_env.HOME }}/.config/nvim/ftplugin" + path: "{{ ansible_env.HOME }}/.config/nvim/{{ item }}" state: absent + loop: + - ftplugin + - dbt - name: Link specific configuration files - ansible.builtin.file: src: "{{ playbook_dir }}/../{{ item.src }}" dest: "{{ 
ansible_env.HOME }}/{{ item.dest }}" @@ -30,6 +32,7 @@ - { src: "nvim/init.lua", dest: ".config/nvim/init.lua" } - { src: "nvim/lua", dest: ".config/nvim/lua" } - { src: "nvim/ftplugin", dest: ".config/nvim/ftplugin" } + - { src: "dbt", dest: ".config/nvim/dbt" } - name: Install jsonlint node.js package. community.general.npm: diff --git a/dbt/dbt_analyse.py b/dbt/dbt_analyse.py new file mode 100755 index 0000000..baec277 --- /dev/null +++ b/dbt/dbt_analyse.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +dbt_analyse: compile a dbt model, gather context, then launch an interactive +cursor-agent session. Designed to be called from a tmux window. + +Usage: + dbt_analyse.py --model --root \ + --filepath --prompt +""" + +import subprocess +import sys +import glob +import os +import click + + +def run(cmd, cwd=None, capture=False, check=True): + result = subprocess.run( + cmd, + cwd=cwd, + capture_output=capture, + text=True, + ) + if check and result.returncode != 0: + stderr = result.stderr.strip() if result.stderr else "" + click.echo(f"ERROR: command failed (exit {result.returncode}): {' '.join(cmd)}", err=True) + if stderr: + click.echo(stderr, err=True) + sys.exit(result.returncode) + return result + + +@click.command() +@click.option("--model", required=True, help="dbt model name (no extension)") +@click.option("--root", required=True, help="Path to dbt project root") +@click.option("--filepath", required=True, help="Absolute path to the source SQL file") +@click.option("--prompt", required=True, help="Path to the prompt template .md file") +@click.option("--limit", default=20, show_default=True, help="Row limit for dbt show") +@click.option( + "--model-flag", default="sonnet-4.6-thinking", show_default=True, help="cursor-agent model" +) +def main(model, root, filepath, prompt, limit, model_flag): + # --- 1. compile --- + click.echo(f"Compiling {model}...") + run(["uv", "run", "dbt", "compile", "-s", model, "--quiet"], cwd=root) + + # --- 2. 
find compiled SQL --- + pattern = os.path.join(root, "target", "compiled", "**", f"{model}.sql") + matches = glob.glob(pattern, recursive=True) + if not matches: + click.echo(f"ERROR: no compiled SQL found for {model} — did compile succeed?", err=True) + sys.exit(1) + compiled_sql = open(matches[0]).read() + click.echo(f"Compiled SQL: {matches[0]}") + + # --- 3. sample rows --- + click.echo(f"Fetching sample rows (limit={limit})...") + result = run( + [ + "uv", + "run", + "dbt", + "show", + "-s", + model, + "--limit", + str(limit), + "--output", + "json", + "--log-format", + "json", + ], + cwd=root, + capture=True, + check=False, + ) + if result.returncode != 0: + click.echo( + f"WARNING: dbt show failed (exit {result.returncode}), continuing without sample rows", + err=True, + ) + sample_rows = "(dbt show failed)" + else: + sample_rows = result.stdout.strip() or "(no rows returned)" + + # --- 4. source SQL --- + if not os.path.exists(filepath): + click.echo(f"ERROR: source file not found: {filepath}", err=True) + sys.exit(1) + source_sql = open(filepath).read() + + # --- 5. build prompt --- + if not os.path.exists(prompt): + click.echo(f"ERROR: prompt template not found: {prompt}", err=True) + sys.exit(1) + template = open(prompt).read() + full_prompt = template.replace("{{compiled_sql}}", compiled_sql).replace( + "{{sample_rows}}", sample_rows + ) + full_prompt += f"\n\nSource SQL:\n{source_sql}" + + # --- 6. 
launch cursor-agent --- + click.echo(f"Launching cursor-agent ({model_flag})...") + os.execlp("cursor-agent", "cursor-agent", "--model", model_flag, full_prompt) + + +if __name__ == "__main__": + main() diff --git a/dbt/dbt_batch_audit.py b/dbt/dbt_batch_audit.py new file mode 100644 index 0000000..04ee536 --- /dev/null +++ b/dbt/dbt_batch_audit.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.10" +# dependencies = ["click", "mdformat"] +# /// +""" +dbt_batch_audit: run dbt model audits across multiple models and LLMs in +parallel, then synthesize a final consolidated report. + +Accepts SQL file paths, directories, and shell globs. Model names are inferred +from filenames (e.g. models/int_orders.sql → int_orders). + +Usage: + # single files + dbt_batch_audit.py models/int_orders.sql models/stg_users.sql ... + + # a whole directory + dbt_batch_audit.py models/intermediate/ + + # shell glob (expanded by the shell before the script sees it) + dbt_batch_audit.py models/int_*.sql + + # mix and match + dbt_batch_audit.py models/intermediate/ models/stg_special.sql +""" + +import subprocess +import sys +import glob +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed + +import click + + +def run(cmd, cwd=None, capture=False, check=True): + result = subprocess.run(cmd, cwd=cwd, capture_output=capture, text=True) + if check and result.returncode != 0: + stderr = result.stderr.strip() if result.stderr else "" + click.echo( + f"ERROR: command failed (exit {result.returncode}): {' '.join(cmd)}", + err=True, + ) + if stderr: + click.echo(stderr, err=True) + sys.exit(result.returncode) + return result + + +def compile_model(model_name, root): + click.echo(f" Compiling {model_name}...") + run(["uv", "run", "dbt", "compile", "-s", model_name, "--quiet"], cwd=root) + pattern = os.path.join(root, "target", "compiled", "**", f"{model_name}.sql") + matches = glob.glob(pattern, recursive=True) + if not matches: + 
click.echo(f"ERROR: no compiled SQL found for {model_name}", err=True) + sys.exit(1) + with open(matches[0]) as f: + return f.read() + + +def get_sample_rows(model_name, root, limit): + click.echo(f" Fetching sample rows for {model_name} (limit={limit})...") + result = run( + [ + "uv", "run", "dbt", "show", "-s", model_name, + "--limit", str(limit), "--output", "json", "--log-format", "json", + ], + cwd=root, + capture=True, + check=False, + ) + if result.returncode != 0: + click.echo(f" WARNING: dbt show failed for {model_name}", err=True) + return "(dbt show failed)" + return result.stdout.strip() or "(no rows returned)" + + +def write_context_file(output_dir, model_name, template, compiled_sql, sample_rows, source_sql): + """Write the full audit context to a file so cursor-agent can read it.""" + ctx_dir = os.path.join(output_dir, ".context") + os.makedirs(ctx_dir, exist_ok=True) + + content = ( + template + .replace("{{compiled_sql}}", compiled_sql) + .replace("{{sample_rows}}", sample_rows) + ) + content += f"\n\nSource SQL:\n{source_sql}" + + ctx_path = os.path.join(ctx_dir, f"{model_name}__context.md") + with open(ctx_path, "w") as f: + f.write(content) + return os.path.abspath(ctx_path) + + +def get_available_models(): + """Return the set of valid model IDs from cursor-agent --list-models.""" + result = subprocess.run( + ["cursor-agent", "--list-models"], + capture_output=True, + text=True, + ) + # Strip ANSI escape codes, then extract the first token of each line that + # looks like a model ID (alphanumeric + hyphens/dots, before the " - " separator). 
+ import re + ansi_escape = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\[[0-9]*[A-Za-z]") + clean = ansi_escape.sub("", result.stdout) + models = set() + for line in clean.splitlines(): + m = re.match(r"^([a-zA-Z0-9][a-zA-Z0-9._-]+)\s+-\s+", line.strip()) + if m: + models.add(m.group(1)) + return models + + +def validate_llms(llms): + """Fail fast if any requested LLM is not in cursor-agent's available models.""" + click.echo("Validating model names against cursor-agent...") + available = get_available_models() + if not available: + click.echo( + "WARNING: could not retrieve available models list; skipping validation", + err=True, + ) + return + invalid = [llm for llm in llms if llm not in available] + if invalid: + click.echo( + f"ERROR: the following model(s) are not available in cursor-agent:\n" + + "\n".join(f" - {m}" for m in invalid) + + f"\n\nAvailable models:\n " + "\n ".join(sorted(available)), + err=True, + ) + sys.exit(1) + click.echo(f" All {len(llms)} model(s) validated OK.\n") + + +def run_audit(model_name, context_path, llm, output_dir, root): + click.echo(f" [{model_name} × {llm}] Starting audit...") + start = time.monotonic() + + prompt = ( + f"Read the audit instructions and dbt model context from {context_path}. " + "Perform a thorough data quality audit of the dbt model as described. " + "Output your complete findings as a well-structured markdown report." 
+ ) + + result = subprocess.run( + ["cursor-agent", "--print", "--force", "--model", llm, prompt], + capture_output=True, + text=True, + cwd=root, + ) + + if result.returncode == 0: + report = result.stdout.strip() + else: + report = f"(audit failed: exit {result.returncode})\n{result.stderr}" + + safe_llm = llm.replace("/", "_").replace(" ", "_") + report_path = os.path.join(output_dir, f"{model_name}__{safe_llm}.md") + with open(report_path, "w") as f: + f.write(f"# Audit: {model_name} (LLM: {llm})\n\n{report}\n") + + elapsed = time.monotonic() - start + click.echo(f" [{model_name} × {llm}] Done ({elapsed:.0f}s) → {report_path}") + return model_name, llm, report + + +def synthesize_reports(reports, synthesis_model, output_dir, root): + click.echo("\nSynthesizing final report...") + start = time.monotonic() + + combined_parts = [] + for model_name, llm, report in reports: + combined_parts.append( + f"---\n## Model: {model_name} | Reviewer: {llm}\n\n{report}\n" + ) + combined_text = "\n".join(combined_parts) + + combined_path = os.path.join(output_dir, "all_individual_reports.md") + with open(combined_path, "w") as f: + f.write(f"# All Individual Audit Reports\n\n{combined_text}\n") + + # Write synthesis context to a file to avoid command-line length limits + ctx_dir = os.path.join(output_dir, ".context") + os.makedirs(ctx_dir, exist_ok=True) + synthesis_ctx_path = os.path.abspath( + os.path.join(ctx_dir, "synthesis_context.md") + ) + + synthesis_instructions = f"""\ +You are a senior analytics engineer reviewing multiple dbt model audit reports. +Each report below was generated by a different LLM auditing a dbt model. + +Your job: +1. Cross-reference findings across models and reviewers +2. Identify the most critical issues that appear consistently +3. Flag any contradictions between reviewers +4. Prioritize recommendations by impact and effort +5. 
Produce a final consolidated report in markdown + +A key part of your analysis is **cross-model bug propagation**: for every significant finding in any +model, explicitly trace its downstream consequences through the model dependency chain. Ask: which +downstream models consume this model's output? If this bug is present in the data, what does that +mean for each downstream model's correctness? Does the bug amplify, get filtered out, or silently +corrupt aggregations further down the chain? Call out cases where a bug in an upstream model makes +a downstream model's output untrustworthy even if the downstream model itself has no defects. + +Generate a final synthesis report with: +- An executive summary +- Critical findings (agreed upon by multiple reviewers or clearly valid), each with an explicit + **Downstream impact** sub-section tracing the bug through the dependency chain +- A dependency propagation map: a table or diagram showing which bugs flow into which downstream + models and what the compounded effect is +- Model-specific recommendations ordered by severity +- Cross-model patterns or systemic issues +- A prioritized action plan table, where items that fix root-cause bugs affecting multiple + downstream models are ranked higher than equivalent-effort fixes that are locally scoped + +--- + +Individual audit reports: + +{combined_text}""" + + with open(synthesis_ctx_path, "w") as f: + f.write(synthesis_instructions) + + prompt = ( + f"Read the synthesis instructions and individual audit reports from " + f"{synthesis_ctx_path}. Follow the instructions to produce a final " + "consolidated synthesis report in markdown." 
+ ) + + result = subprocess.run( + ["cursor-agent", "--print", "--force", "--model", synthesis_model, prompt], + capture_output=True, + text=True, + cwd=root, + ) + + if result.returncode == 0: + synthesis = result.stdout.strip() + else: + synthesis = ( + f"(synthesis failed: exit {result.returncode})\n{result.stderr}" + ) + + synthesis_path = os.path.join(output_dir, "final_synthesis.md") + with open(synthesis_path, "w") as f: + f.write(f"# dbt Model Audit — Final Synthesis\n\n{synthesis}\n") + + elapsed = time.monotonic() - start + click.echo(f"Final synthesis ({elapsed:.0f}s) → {synthesis_path}") + click.echo(f"Combined reports → {combined_path}") + return synthesis_path + + +def resolve_sql_paths(paths): + """Expand directories and verify that every path is a .sql file.""" + resolved = [] + for p in paths: + p = os.path.abspath(p) + if os.path.isdir(p): + children = sorted( + f for f in glob.glob(os.path.join(p, "**", "*.sql"), recursive=True) + ) + if not children: + click.echo(f"WARNING: no .sql files found in {p}", err=True) + resolved.extend(children) + elif os.path.isfile(p): + if not p.endswith(".sql"): + click.echo(f"ERROR: not a .sql file: {p}", err=True) + sys.exit(1) + resolved.append(p) + else: + click.echo(f"ERROR: path not found: {p}", err=True) + sys.exit(1) + return resolved + + +def model_name_from_path(filepath): + return os.path.splitext(os.path.basename(filepath))[0] + + +@click.command() +@click.argument("paths", nargs=-1, required=True) +@click.option( + "--llm", "llms", required=True, multiple=True, + help="LLM model name for cursor-agent (repeatable)", +) +@click.option("--root", required=True, help="Path to dbt project root") +@click.option("--prompt", required=True, help="Path to the prompt template .md file") +@click.option( + "--output-dir", default="./audit_reports", show_default=True, + help="Directory for output reports", +) +@click.option("--limit", default=20, show_default=True, help="Row limit for dbt show") +@click.option( + 
"--synthesis-model", default="sonnet-4.6-thinking", show_default=True, + help="LLM for the final synthesis step", +) +@click.option( + "--concurrency", default=3, show_default=True, + help="Max parallel cursor-agent invocations", +) +def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurrency): + """Run dbt model audits across multiple models and LLMs, then synthesize. + + PATHS are .sql files, directories containing .sql files, or shell globs. + Model names are inferred from filenames (int_orders.sql → int_orders). + """ + validate_llms(llms + (synthesis_model,)) + + sql_files = resolve_sql_paths(paths) + if not sql_files: + click.echo("ERROR: no .sql files resolved from the given paths", err=True) + sys.exit(1) + + model_specs = [] + seen = set() + for filepath in sql_files: + name = model_name_from_path(filepath) + if name in seen: + click.echo( + f"ERROR: duplicate model name '{name}' from {filepath}", err=True, + ) + sys.exit(1) + seen.add(name) + model_specs.append((name, filepath)) + + if not os.path.exists(prompt): + click.echo(f"ERROR: prompt template not found: {prompt}", err=True) + sys.exit(1) + with open(prompt) as f: + template = f.read() + + root = os.path.abspath(root) + output_dir = os.path.abspath(output_dir) + os.makedirs(output_dir, exist_ok=True) + + total = len(model_specs) * len(llms) + click.echo( + f"Auditing {len(model_specs)} model(s) × {len(llms)} LLM(s) = {total} audit(s)" + ) + click.echo(f"Concurrency: {concurrency} | Synthesis model: {synthesis_model}\n") + + context_paths = {} + for name, filepath in model_specs: + click.echo(f"Preparing {name}...") + compiled_sql = compile_model(name, root) + sample_rows = get_sample_rows(name, root, limit) + with open(filepath) as f: + source_sql = f.read() + context_paths[name] = write_context_file( + output_dir, name, template, compiled_sql, sample_rows, source_sql, + ) + + click.echo(f"\nAll models compiled. 
Launching {total} audit(s)...\n") + + reports = [] + with ThreadPoolExecutor(max_workers=concurrency) as pool: + futures = {} + for name, _ in model_specs: + for llm in llms: + fut = pool.submit( + run_audit, name, context_paths[name], llm, output_dir, root, + ) + futures[fut] = (name, llm) + + for fut in as_completed(futures): + name, llm = futures[fut] + try: + reports.append(fut.result()) + except Exception as e: + click.echo(f" ERROR [{name} × {llm}]: {e}", err=True) + reports.append((name, llm, f"(error: {e})")) + + reports.sort(key=lambda r: (r[0], r[1])) + + synthesize_reports(reports, synthesis_model, output_dir, root) + + md_files = glob.glob(os.path.join(output_dir, "*.md")) + if md_files: + click.echo(f"\nFormatting {len(md_files)} markdown file(s)...") + run( + ["uvx", "mdformat", "--wrap", "100"] + md_files, + cwd=root, + ) + + click.echo(f"\nDone! {len(reports)} audit(s) completed. Reports in: {output_dir}") + + +if __name__ == "__main__": + main() From b5a745bdc4e6195dcc8dae1b6190010dc497b256 Mon Sep 17 00:00:00 2001 From: Michael Barton Date: Fri, 6 Mar 2026 13:49:00 -0800 Subject: [PATCH 19/26] Update dbt nvim keymaps --- nvim/lua/config/keymaps.lua | 43 +++++++++++++++---------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index a4252ec..2e13c82 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -368,9 +368,9 @@ for line in raw.splitlines(): :toggle() end, { desc = "[D]bt [V]isidata preview" }) --- dbt: read a prompt template from nvim/prompts/ and substitute {{key}} placeholders +-- dbt: read a prompt template from the llm/ directory and substitute {{key}} placeholders local function dbt_load_prompt(name, vars) - local prompt_dir = vim.fn.stdpath("config") .. "/prompts/" + local prompt_dir = vim.fn.stdpath("config") .. "/dbt/" local path = prompt_dir .. name .. 
".md" local lines = vim.fn.readfile(path) if #lines == 0 then @@ -384,7 +384,7 @@ local function dbt_load_prompt(name, vars) return prompt end --- dbt: run claude agent on current model (quick analysis with sonnet) +-- dbt: run cursor-agent on current model (quick analysis with sonnet) -- Replaces buffer contents with the file + inline SQL comments vim.keymap.set("n", "da", function() local filepath = vim.fn.expand("%:p") @@ -400,7 +400,9 @@ vim.keymap.set("n", "da", function() vim.notify("Running quick analysis...", vim.log.levels.INFO) - local cmd = { "claude", "-p", prompt, "--model", "claude-sonnet-4-6", filepath } + local file_content = table.concat(vim.fn.readfile(filepath), "\n") + local full_prompt = prompt .. "\n\nFile: " .. filepath .. "\n```sql\n" .. file_content .. "\n```" + local cmd = { "cursor-agent", "--print", "--model", "sonnet-4.6", full_prompt } local output = {} vim.fn.jobstart(cmd, { stdout_buffered = true, @@ -412,7 +414,7 @@ vim.keymap.set("n", "da", function() on_exit = function(_, exit_code) vim.schedule(function() if exit_code ~= 0 then - vim.notify("claude exited with code " .. exit_code, vim.log.levels.ERROR) + vim.notify("cursor-agent exited with code " .. 
exit_code, vim.log.levels.ERROR) return end -- Remove trailing empty strings from jobstart output @@ -420,7 +422,7 @@ vim.keymap.set("n", "da", function() table.remove(output) end if #output == 0 then - vim.notify("No output from claude", vim.log.levels.WARN) + vim.notify("No output from cursor-agent", vim.log.levels.WARN) return end vim.api.nvim_buf_set_lines(bufnr, 0, -1, false, output) @@ -430,7 +432,7 @@ vim.keymap.set("n", "da", function() }) end, { desc = "[D]bt [A]nalyse model (quick)" }) --- dbt: open interactive claude session in a new tmux window with compiled SQL + sample rows as context +-- dbt: open interactive cursor-agent session in a new tmux window with compiled SQL + sample rows as context vim.keymap.set("n", "dA", function() local filepath = vim.fn.expand("%:p") if filepath == "" then @@ -447,24 +449,13 @@ vim.keymap.set("n", "dA", function() return end - -- Open a new tmux window that compiles, gathers context, then starts interactive claude - local prompt_path = vim.fn.stdpath("config") .. "/prompts/dbt_deep_analysis.md" - local cmd = string.format( - [[tmux new-window -n 'dbt:%s' ']] - .. [[uv run dbt compile -s %s --quiet ]] - .. [[&& compiled_sql=$(cat $(find %s/target/compiled -name "%s.sql" | head -1) 2>/dev/null) ]] - .. [[&& sample_rows=$(uv run dbt show -s %s --limit 20 2>/dev/null) ]] - .. [[&& prompt=$(sed -e "s|{{compiled_sql}}|${compiled_sql}|g" -e "s|{{sample_rows}}|${sample_rows}|g" %s) ]] - .. [[&& claude --model claude-sonnet-4-6 --thinking --prompt "${prompt}" %s ]] - .. [[|| read -p "Press enter to close..."']], - model, - model, - vim.fn.shellescape(root), - model, - model, - vim.fn.shellescape(prompt_path), - vim.fn.shellescape(filepath) + -- Open a new tmux window running the standalone dbt_analyse.py script + local prompt_path = vim.fn.stdpath("config") .. "/dbt/dbt_deep_analysis.md" + local script_path = vim.fn.stdpath("config") .. 
"/dbt/dbt_analyse.py" + local shell_script = string.format( + [[cd %s && uv run python3 %s --model %s --root %s --filepath %s --prompt %s || (echo "Press enter to close..." && read)]], + root, script_path, model, root, filepath, prompt_path ) - vim.fn.system(cmd) - vim.notify("Opened interactive claude session in tmux window 'dbt:" .. model .. "'", vim.log.levels.INFO) + vim.fn.jobstart({ "tmux", "new-window", "-n", "dbt:" .. model, shell_script }, { detach = true }) + vim.notify("Opened interactive cursor-agent session in tmux window 'dbt:" .. model .. "'", vim.log.levels.INFO) end, { desc = "[D]bt [A]nalyse model (interactive)" }) From abd77b073b6368553f400f22b936ae02e5a3a0e8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 6 Mar 2026 21:56:52 +0000 Subject: [PATCH 20/26] Fix resource leaks, safety, and robustness issues in dbt audit scripts - Close file handles properly in dbt_analyse.py (use `with` statements) - Add PEP 723 inline metadata to dbt_analyse.py for uv compatibility - Move `import re` to module level in dbt_batch_audit.py - Write context to a temp file in dbt_analyse.py to avoid ARG_MAX limits - Add subprocess timeouts (900s audit, 1200s synthesis) to prevent hangs - Order template substitutions to avoid placeholder injection https://claude.ai/code/session_01RHSUYqsWy6xLfAC9tFTy66 --- dbt/dbt_analyse.py | 39 +++++++++++++++++++++++------ dbt/dbt_batch_audit.py | 56 ++++++++++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 26 deletions(-) diff --git a/dbt/dbt_analyse.py b/dbt/dbt_analyse.py index baec277..4c9a3f4 100755 --- a/dbt/dbt_analyse.py +++ b/dbt/dbt_analyse.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +# /// script +# requires-python = ">=3.10" +# dependencies = ["click"] +# /// """ dbt_analyse: compile a dbt model, gather context, then launch an interactive cursor-agent session. Designed to be called from a tmux window. 
@@ -12,6 +16,8 @@
 import sys
 import glob
 import os
+import tempfile
+
 import click
 
 
@@ -51,7 +57,8 @@ def main(model, root, filepath, prompt, limit, model_flag):
     if not matches:
         click.echo(f"ERROR: no compiled SQL found for {model} — did compile succeed?", err=True)
         sys.exit(1)
-    compiled_sql = open(matches[0]).read()
+    with open(matches[0]) as f:
+        compiled_sql = f.read()
     click.echo(f"Compiled SQL: {matches[0]}")
 
     # --- 3. sample rows ---
@@ -88,21 +95,37 @@
     if not os.path.exists(filepath):
         click.echo(f"ERROR: source file not found: {filepath}", err=True)
         sys.exit(1)
-    source_sql = open(filepath).read()
+    with open(filepath) as f:
+        source_sql = f.read()
 
     # --- 5. build prompt ---
     if not os.path.exists(prompt):
         click.echo(f"ERROR: prompt template not found: {prompt}", err=True)
         sys.exit(1)
-    template = open(prompt).read()
-    full_prompt = template.replace("{{compiled_sql}}", compiled_sql).replace(
-        "{{sample_rows}}", sample_rows
-    )
+    with open(prompt) as f:
+        template = f.read()
+    # Substitute compiled_sql first, then sample_rows. NOTE(review): a literal
+    # {{sample_rows}} token inside the compiled SQL would still get expanded.
+    full_prompt = template.replace("{{compiled_sql}}", compiled_sql)
+    full_prompt = full_prompt.replace("{{sample_rows}}", sample_rows)
     full_prompt += f"\n\nSource SQL:\n{source_sql}"
 
-    # --- 6. launch cursor-agent ---
+    # --- 6. write context to a temp file & launch cursor-agent ---
+    # Avoids OS arg-length limits (ARG_MAX) when compiled SQL + sample rows
+    # are large.
+    ctx = tempfile.NamedTemporaryFile(
+        mode="w", suffix=".md", prefix=f"dbt_audit_{model}_", delete=False,
+    )
+    ctx.write(full_prompt)
+    ctx.close()
+    click.echo(f"Context written to {ctx.name}")
+
     click.echo(f"Launching cursor-agent ({model_flag})...")
-    os.execlp("cursor-agent", "cursor-agent", "--model", model_flag, full_prompt)
+    agent_prompt = (
+        f"Read the audit instructions and dbt model context from {ctx.name}. 
" + "Perform a thorough data quality audit of the dbt model as described." + ) + os.execlp("cursor-agent", "cursor-agent", "--model", model_flag, agent_prompt) if __name__ == "__main__": diff --git a/dbt/dbt_batch_audit.py b/dbt/dbt_batch_audit.py index 04ee536..15eb36b 100644 --- a/dbt/dbt_batch_audit.py +++ b/dbt/dbt_batch_audit.py @@ -24,6 +24,7 @@ dbt_batch_audit.py models/intermediate/ models/stg_special.sql """ +import re import subprocess import sys import glob @@ -82,11 +83,10 @@ def write_context_file(output_dir, model_name, template, compiled_sql, sample_ro ctx_dir = os.path.join(output_dir, ".context") os.makedirs(ctx_dir, exist_ok=True) - content = ( - template - .replace("{{compiled_sql}}", compiled_sql) - .replace("{{sample_rows}}", sample_rows) - ) + # Substitute compiled_sql first to avoid the compiled SQL accidentally + # containing the {{sample_rows}} placeholder. + content = template.replace("{{compiled_sql}}", compiled_sql) + content = content.replace("{{sample_rows}}", sample_rows) content += f"\n\nSource SQL:\n{source_sql}" ctx_path = os.path.join(ctx_dir, f"{model_name}__context.md") @@ -104,7 +104,6 @@ def get_available_models(): ) # Strip ANSI escape codes, then extract the first token of each line that # looks like a model ID (alphanumeric + hyphens/dots, before the " - " separator). - import re ansi_escape = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\[[0-9]*[A-Za-z]") clean = ansi_escape.sub("", result.stdout) models = set() @@ -147,12 +146,24 @@ def run_audit(model_name, context_path, llm, output_dir, root): "Output your complete findings as a well-structured markdown report." 
) - result = subprocess.run( - ["cursor-agent", "--print", "--force", "--model", llm, prompt], - capture_output=True, - text=True, - cwd=root, - ) + try: + result = subprocess.run( + ["cursor-agent", "--print", "--force", "--model", llm, prompt], + capture_output=True, + text=True, + cwd=root, + timeout=900, + ) + except subprocess.TimeoutExpired: + report = "(audit timed out after 900s)" + click.echo(f" [{model_name} × {llm}] Timed out", err=True) + safe_llm = llm.replace("/", "_").replace(" ", "_") + report_path = os.path.join(output_dir, f"{model_name}__{safe_llm}.md") + with open(report_path, "w") as f: + f.write(f"# Audit: {model_name} (LLM: {llm})\n\n{report}\n") + elapsed = time.monotonic() - start + click.echo(f" [{model_name} × {llm}] Done ({elapsed:.0f}s) → {report_path}") + return model_name, llm, report if result.returncode == 0: report = result.stdout.strip() @@ -235,12 +246,21 @@ def synthesize_reports(reports, synthesis_model, output_dir, root): "consolidated synthesis report in markdown." 
) - result = subprocess.run( - ["cursor-agent", "--print", "--force", "--model", synthesis_model, prompt], - capture_output=True, - text=True, - cwd=root, - ) + try: + result = subprocess.run( + ["cursor-agent", "--print", "--force", "--model", synthesis_model, prompt], + capture_output=True, + text=True, + cwd=root, + timeout=1200, + ) + except subprocess.TimeoutExpired: + synthesis = "(synthesis timed out after 1200s)" + click.echo("WARNING: synthesis timed out", err=True) + synthesis_path = os.path.join(output_dir, "final_synthesis.md") + with open(synthesis_path, "w") as f: + f.write(f"# dbt Model Audit — Final Synthesis\n\n{synthesis}\n") + return synthesis_path if result.returncode == 0: synthesis = result.stdout.strip() From bd959492154d11d2a3f35a8d16a3211091765d90 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 6 Mar 2026 22:43:30 +0000 Subject: [PATCH 21/26] Enrich dbt audit with lineage, test coverage, and structured prompts - Rewrite dbt_deep_analysis.md with a comprehensive 8-section audit checklist covering schema/types, join correctness, filters, grain, data quality, performance, test coverage gaps, and upstream risks - Add structured output format (findings table, evidence queries, suggested dbt test YAML snippets) - Add conditional template sections ({{#if lineage}}, {{#if existing_tests}}) with a lightweight Handlebars-style renderer in both scripts - Gather model lineage (parents/children) via `dbt ls` selectors - Scan schema.yml files for existing test definitions and include them so the LLM focuses on coverage gaps rather than redundant suggestions - Add pyyaml dependency to both scripts' PEP 723 metadata https://claude.ai/code/session_01RHSUYqsWy6xLfAC9tFTy66 --- dbt/dbt_analyse.py | 145 +++++++++++++++++++++++++++++++++------ dbt/dbt_batch_audit.py | 93 +++++++++++++++++++++++-- dbt/dbt_deep_analysis.md | 97 +++++++++++++++++++++++++- 3 files changed, 305 insertions(+), 30 deletions(-) diff --git a/dbt/dbt_analyse.py b/dbt/dbt_analyse.py 
index 4c9a3f4..91ba2e8 100755 --- a/dbt/dbt_analyse.py +++ b/dbt/dbt_analyse.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" -# dependencies = ["click"] +# dependencies = ["click", "pyyaml"] # /// """ dbt_analyse: compile a dbt model, gather context, then launch an interactive @@ -12,13 +12,16 @@ --filepath --prompt """ +import json import subprocess import sys import glob import os +import re import tempfile import click +import yaml def run(cmd, cwd=None, capture=False, check=True): @@ -37,6 +40,107 @@ def run(cmd, cwd=None, capture=False, check=True): return result +def get_lineage(model, root): + """Return a summary of immediate parents and children from dbt ls.""" + lines = [] + for direction, selector in [("parents", f"+{model},1+{model}"), ("children", f"{model}+,{model}1+")]: + result = subprocess.run( + ["uv", "run", "dbt", "ls", "-s", selector, "--output", "name", "--quiet"], + capture_output=True, text=True, cwd=root, + ) + if result.returncode == 0: + names = [n.strip() for n in result.stdout.strip().splitlines() if n.strip() and n.strip() != model] + if names: + lines.append(f"**{direction.title()}:** {', '.join(names)}") + return "\n".join(lines) if lines else "" + + +def get_existing_tests(model, root): + """Extract test definitions for a model from schema.yml files.""" + schema_files = glob.glob(os.path.join(root, "**", "schema.yml"), recursive=True) + schema_files += glob.glob(os.path.join(root, "**", "_schema.yml"), recursive=True) + schema_files += glob.glob(os.path.join(root, "**", f"*_models.yml"), recursive=True) + schema_files += glob.glob(os.path.join(root, "**", f"*.yml"), recursive=True) + # Deduplicate while preserving order + seen = set() + unique_files = [] + for f in schema_files: + if f not in seen: + seen.add(f) + unique_files.append(f) + + tests = [] + for schema_path in unique_files: + try: + with open(schema_path) as f: + doc = yaml.safe_load(f) + except (yaml.YAMLError, OSError): + continue + 
if not isinstance(doc, dict): + continue + for m in doc.get("models", []): + if not isinstance(m, dict) or m.get("name") != model: + continue + # Model-level tests + for t in m.get("tests", []): + tests.append(f"- model-level: {t}") + # Column-level tests + for col in m.get("columns", []): + if not isinstance(col, dict): + continue + col_name = col.get("name", "?") + for t in col.get("tests", []): + if isinstance(t, str): + tests.append(f"- {col_name}: {t}") + elif isinstance(t, dict): + tests.append(f"- {col_name}: {t}") + return "\n".join(tests) if tests else "" + + +def get_data_profile(model, root): + """Run a profiling query via dbt show to get column stats.""" + # Use an inline query that profiles the model's output + profile_sql = f""" + {{% set cols = adapter.get_columns_in_relation(ref('{model}')) %}} + SELECT + {{% for col in cols %}} + '{{{ col.name }}}' AS column_name_{{{{ loop.index }}}}, + COUNT(*) AS total_rows_{{{{ loop.index }}}}, + COUNT("{{{ col.name }}}") AS non_null_{{{{ loop.index }}}}, + COUNT(DISTINCT "{{{ col.name }}}") AS distinct_{{{{ loop.index }}}} + {{% if not loop.last %}},{{% endif %}} + {{% endfor %}} + FROM {{{{ ref('{model}') }}}} + """ + # Simpler approach: ask dbt to show the model with a higher limit and + # compute stats from sample rows in the prompt. The LLM has database + # access and can run profiling queries itself. We just nudge it. 
+ return "" + + +def render_template(template, replacements): + """Replace template placeholders, handling conditional {{#if}}/{{^if}} blocks.""" + for key, value in replacements.items(): + # Handle {{#if key}}...{{/if}} blocks + if_pattern = re.compile( + r"\{\{#if " + re.escape(key) + r"\}\}(.*?)\{\{/if\}\}", + re.DOTALL, + ) + not_pattern = re.compile( + r"\{\{\^if " + re.escape(key) + r"\}\}(.*?)\{\{/if\}\}", + re.DOTALL, + ) + if value: + template = if_pattern.sub(r"\1", template) + template = not_pattern.sub("", template) + else: + template = if_pattern.sub("", template) + template = not_pattern.sub(r"\1", template) + # Replace the simple placeholder + template = template.replace("{{" + key + "}}", value) + return template + + @click.command() @click.option("--model", required=True, help="dbt model name (no extension)") @click.option("--root", required=True, help="Path to dbt project root") @@ -65,18 +169,8 @@ def main(model, root, filepath, prompt, limit, model_flag): click.echo(f"Fetching sample rows (limit={limit})...") result = run( [ - "uv", - "run", - "dbt", - "show", - "-s", - model, - "--limit", - str(limit), - "--output", - "json", - "--log-format", - "json", + "uv", "run", "dbt", "show", "-s", model, + "--limit", str(limit), "--output", "json", "--log-format", "json", ], cwd=root, capture=True, @@ -98,21 +192,30 @@ def main(model, root, filepath, prompt, limit, model_flag): with open(filepath) as f: source_sql = f.read() - # --- 5. build prompt --- + # --- 5. gather lineage and existing tests --- + click.echo("Gathering model lineage...") + lineage = get_lineage(model, root) + + click.echo("Scanning for existing dbt tests...") + existing_tests = get_existing_tests(model, root) + + # --- 6. 
build prompt --- if not os.path.exists(prompt): click.echo(f"ERROR: prompt template not found: {prompt}", err=True) sys.exit(1) with open(prompt) as f: template = f.read() - # Substitute compiled_sql first; use a sentinel to avoid the compiled SQL - # accidentally containing the {{sample_rows}} placeholder. - full_prompt = template.replace("{{compiled_sql}}", compiled_sql) - full_prompt = full_prompt.replace("{{sample_rows}}", sample_rows) + + full_prompt = render_template(template, { + "compiled_sql": compiled_sql, + "sample_rows": sample_rows, + "existing_tests": existing_tests, + "lineage": lineage, + "data_profile": "", + }) full_prompt += f"\n\nSource SQL:\n{source_sql}" - # --- 6. write context to a temp file & launch cursor-agent --- - # Avoids OS arg-length limits (ARG_MAX) when compiled SQL + sample rows - # are large. + # --- 7. write context to a temp file & launch cursor-agent --- ctx = tempfile.NamedTemporaryFile( mode="w", suffix=".md", prefix=f"dbt_audit_{model}_", delete=False, ) diff --git a/dbt/dbt_batch_audit.py b/dbt/dbt_batch_audit.py index 15eb36b..6fca44a 100644 --- a/dbt/dbt_batch_audit.py +++ b/dbt/dbt_batch_audit.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" -# dependencies = ["click", "mdformat"] +# dependencies = ["click", "mdformat", "pyyaml"] # /// """ dbt_batch_audit: run dbt model audits across multiple models and LLMs in @@ -33,6 +33,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import click +import yaml def run(cmd, cwd=None, capture=False, check=True): @@ -78,15 +79,90 @@ def get_sample_rows(model_name, root, limit): return result.stdout.strip() or "(no rows returned)" -def write_context_file(output_dir, model_name, template, compiled_sql, sample_rows, source_sql): +def get_lineage(model_name, root): + """Return a summary of immediate parents and children from dbt ls.""" + lines = [] + for direction, selector in [("parents", f"+{model_name},1+{model_name}"), ("children", 
f"{model_name}+,{model_name}1+")]: + result = subprocess.run( + ["uv", "run", "dbt", "ls", "-s", selector, "--output", "name", "--quiet"], + capture_output=True, text=True, cwd=root, + ) + if result.returncode == 0: + names = [n.strip() for n in result.stdout.strip().splitlines() if n.strip() and n.strip() != model_name] + if names: + lines.append(f"**{direction.title()}:** {', '.join(names)}") + return "\n".join(lines) if lines else "" + + +def get_existing_tests(model_name, root): + """Extract test definitions for a model from schema.yml files.""" + schema_files = glob.glob(os.path.join(root, "**", "*.yml"), recursive=True) + seen = set() + unique_files = [] + for f in schema_files: + if f not in seen: + seen.add(f) + unique_files.append(f) + + tests = [] + for schema_path in unique_files: + try: + with open(schema_path) as f: + doc = yaml.safe_load(f) + except (yaml.YAMLError, OSError): + continue + if not isinstance(doc, dict): + continue + for m in doc.get("models", []): + if not isinstance(m, dict) or m.get("name") != model_name: + continue + for t in m.get("tests", []): + tests.append(f"- model-level: {t}") + for col in m.get("columns", []): + if not isinstance(col, dict): + continue + col_name = col.get("name", "?") + for t in col.get("tests", []): + if isinstance(t, str): + tests.append(f"- {col_name}: {t}") + elif isinstance(t, dict): + tests.append(f"- {col_name}: {t}") + return "\n".join(tests) if tests else "" + + +def render_template(template, replacements): + """Replace template placeholders, handling conditional {{#if}}/{{^if}} blocks.""" + for key, value in replacements.items(): + if_pattern = re.compile( + r"\{\{#if " + re.escape(key) + r"\}\}(.*?)\{\{/if\}\}", + re.DOTALL, + ) + not_pattern = re.compile( + r"\{\{\^if " + re.escape(key) + r"\}\}(.*?)\{\{/if\}\}", + re.DOTALL, + ) + if value: + template = if_pattern.sub(r"\1", template) + template = not_pattern.sub("", template) + else: + template = if_pattern.sub("", template) + template = 
not_pattern.sub(r"\1", template) + template = template.replace("{{" + key + "}}", value) + return template + + +def write_context_file(output_dir, model_name, template, compiled_sql, sample_rows, source_sql, lineage="", existing_tests=""): """Write the full audit context to a file so cursor-agent can read it.""" ctx_dir = os.path.join(output_dir, ".context") os.makedirs(ctx_dir, exist_ok=True) - # Substitute compiled_sql first to avoid the compiled SQL accidentally - # containing the {{sample_rows}} placeholder. - content = template.replace("{{compiled_sql}}", compiled_sql) - content = content.replace("{{sample_rows}}", sample_rows) + content = render_template(template, { + "compiled_sql": compiled_sql, + "sample_rows": sample_rows, + "existing_tests": existing_tests, + "lineage": lineage, + "data_profile": "", + }) content += f"\n\nSource SQL:\n{source_sql}" ctx_path = os.path.join(ctx_dir, f"{model_name}__context.md") @@ -375,8 +451,13 @@ def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurre sample_rows = get_sample_rows(name, root, limit) with open(filepath) as f: source_sql = f.read() + click.echo(f" Gathering lineage for {name}...") + lineage = get_lineage(name, root) + click.echo(f" Scanning tests for {name}...") + existing_tests = get_existing_tests(name, root) context_paths[name] = write_context_file( output_dir, name, template, compiled_sql, sample_rows, source_sql, + lineage=lineage, existing_tests=existing_tests, ) click.echo(f"\nAll models compiled. Launching {total} audit(s)...\n") diff --git a/dbt/dbt_deep_analysis.md b/dbt/dbt_deep_analysis.md index 397ca3d..fbc10d4 100644 --- a/dbt/dbt_deep_analysis.md +++ b/dbt/dbt_deep_analysis.md @@ -1,7 +1,98 @@ -You have access to a duckdb database. Interrogate the database to understand the schema and data, then cross-reference with this dbt model. Check for: data quality issues, join correctness, missing filters, column type mismatches, and potential improvements. 
Run queries to validate assumptions. +You have access to a duckdb database. You are auditing a dbt model for data quality, correctness, and best practices. Interrogate the database to validate every claim you make — do not speculate without running a query first. -Here is the compiled SQL: +## Audit checklist + +Work through each section. For every finding, run a query to confirm it. + +### 1. Schema & types +- Are column types appropriate (e.g. dates stored as DATE not VARCHAR, monetary values as DECIMAL not FLOAT)? +- Are there implicit casts in joins or WHERE clauses that could silently drop rows or change values? +- Do any columns contain mixed types or unexpected NULLs? + +### 2. Join correctness +- Is every join relationship correct (1:1, 1:N, M:N)? Run a query: does the join **fan out** (produce more rows than the driving table)? +- Are there orphaned rows (LEFT JOIN misses)? What fraction of rows have NULL foreign keys after the join? +- Are join keys unique on the side that should be unique? Query `COUNT(*) vs COUNT(DISTINCT key)`. + +### 3. Filters & business logic +- Are there WHERE / HAVING filters that could silently exclude valid records (e.g. filtering on a column that is sometimes NULL)? +- Is there business logic (CASE statements, date arithmetic, aggregations) that could produce wrong results on edge cases? +- Are date boundaries inclusive/exclusive as intended? + +### 4. Grain & uniqueness +- What is the intended grain of this model? Verify with `COUNT(*) vs COUNT(DISTINCT )`. +- Could the model produce duplicate rows under any upstream data condition? + +### 5. Data quality +- What percentage of each column is NULL? Flag any column where the NULL rate is suspicious. +- Are there unexpected duplicate values, negative numbers, future dates, or empty strings where there shouldn't be? +- Do value distributions look reasonable (run MIN, MAX, AVG, percentiles for numeric columns)? + +### 6. 
Performance & best practices +- Are there SELECT * or unnecessary columns being carried through? +- Could CTEs be simplified or combined? +- Are there window functions that could be replaced with simpler aggregations, or vice versa? +- Is the model incremental where it should be, or full-refresh where incremental would be better? + +### 7. Test coverage gaps +{{#if existing_tests}} +The following dbt tests are already defined for this model: +{{existing_tests}} + +Identify what is NOT covered by existing tests. Focus recommendations on gaps. +{{/if}} +{{^if existing_tests}} +No dbt tests were found for this model. Recommend the most important tests to add. +{{/if}} + +### 8. Upstream dependency risks +{{#if lineage}} +Model lineage (immediate upstream/downstream): +{{lineage}} + +Consider: if an upstream model delivers late, delivers duplicates, or changes its grain, how does this model behave? Are there defensive checks? +{{/if}} + +## Context + +### Compiled SQL {{compiled_sql}} -Here are the first 20 rows returned by this model: +### Sample rows {{sample_rows}} + +### Data profile +{{#if data_profile}} +{{data_profile}} +{{/if}} + +## Output format + +Structure your report as follows: + +``` +## Executive summary +(2-3 sentences: overall health, most critical finding) + +## Critical findings +(Issues that could produce wrong numbers in production) + +| # | Finding | Severity | Evidence query | Affected columns | +|---|---------|----------|---------------|-----------------| +| 1 | ... | ... | ... | ... 
| + +### Finding 1: +**Query:** <the SQL you ran> +**Result:** <what you found> +**Impact:** <what goes wrong downstream> +**Fix:** <specific SQL or config change> + +## Warnings +(Issues that aren't wrong today but are fragile) + +## Recommendations +(Best-practice improvements, ordered by impact) + +## Suggested dbt tests +(Specific test YAML snippets to add) +``` From 0c6912ba7759ec102b27a51225f5594e413f80f7 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Fri, 6 Mar 2026 22:46:45 +0000 Subject: [PATCH 22/26] Clean up dead code in dbt_analyse.py - Remove unused `import json` - Remove stub `get_data_profile` that always returned empty string - Simplify redundant glob patterns in get_existing_tests (*.yml already covers schema.yml, _schema.yml, *_models.yml) https://claude.ai/code/session_01RHSUYqsWy6xLfAC9tFTy66 --- dbt/dbt_analyse.py | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/dbt/dbt_analyse.py b/dbt/dbt_analyse.py index 91ba2e8..f065d22 100755 --- a/dbt/dbt_analyse.py +++ b/dbt/dbt_analyse.py @@ -12,7 +12,6 @@ --filepath <source_sql_path> --prompt <prompt_template_path> """ -import json import subprocess import sys import glob @@ -57,17 +56,7 @@ def get_lineage(model, root): def get_existing_tests(model, root): """Extract test definitions for a model from schema.yml files.""" - schema_files = glob.glob(os.path.join(root, "**", "schema.yml"), recursive=True) - schema_files += glob.glob(os.path.join(root, "**", "_schema.yml"), recursive=True) - schema_files += glob.glob(os.path.join(root, "**", f"*_models.yml"), recursive=True) - schema_files += glob.glob(os.path.join(root, "**", f"*.yml"), recursive=True) - # Deduplicate while preserving order - seen = set() - unique_files = [] - for f in schema_files: - if f not in seen: - seen.add(f) - unique_files.append(f) + unique_files = glob.glob(os.path.join(root, "**", "*.yml"), recursive=True) tests = [] for schema_path in unique_files: @@ 
-97,26 +86,6 @@ def get_existing_tests(model, root): return "\n".join(tests) if tests else "" -def get_data_profile(model, root): - """Run a profiling query via dbt show to get column stats.""" - # Use an inline query that profiles the model's output - profile_sql = f""" - {{% set cols = adapter.get_columns_in_relation(ref('{model}')) %}} - SELECT - {{% for col in cols %}} - '{{{ col.name }}}' AS column_name_{{{{ loop.index }}}}, - COUNT(*) AS total_rows_{{{{ loop.index }}}}, - COUNT("{{{ col.name }}}") AS non_null_{{{{ loop.index }}}}, - COUNT(DISTINCT "{{{ col.name }}}") AS distinct_{{{{ loop.index }}}} - {{% if not loop.last %}},{{% endif %}} - {{% endfor %}} - FROM {{{{ ref('{model}') }}}} - """ - # Simpler approach: ask dbt to show the model with a higher limit and - # compute stats from sample rows in the prompt. The LLM has database - # access and can run profiling queries itself. We just nudge it. - return "" - def render_template(template, replacements): """Replace template placeholders, handling conditional {{#if}}/{{^if}} blocks.""" From 9eb9e439a3baee1dd9d751c49a814fafd80f0abc Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Fri, 6 Mar 2026 22:56:23 +0000 Subject: [PATCH 23/26] Split dbt nvim keymaps into their own file Move all dbt helpers and keymaps from keymaps.lua into a dedicated config/dbt.lua, loaded via require("config.dbt"). Keeps keymaps.lua focused on general-purpose bindings. 
https://claude.ai/code/session_01RHSUYqsWy6xLfAC9tFTy66 --- nvim/lua/config/dbt.lua | 409 ++++++++++++++++++++++++++++++++++++ nvim/lua/config/keymaps.lua | 409 +----------------------------------- 2 files changed, 411 insertions(+), 407 deletions(-) create mode 100644 nvim/lua/config/dbt.lua diff --git a/nvim/lua/config/dbt.lua b/nvim/lua/config/dbt.lua new file mode 100644 index 0000000..962891b --- /dev/null +++ b/nvim/lua/config/dbt.lua @@ -0,0 +1,409 @@ +-- dbt keymaps and helpers +-- Loaded from keymaps.lua + +-- Extract model name from current file path (e.g., models/staging/stg_orders.sql -> stg_orders) +local function dbt_model_name() + local filepath = vim.fn.expand("%:t:r") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return nil + end + return filepath +end + +-- Format the current SQL file, then send a dbt command to the terminal +local function dbt_cmd(cmd_template) + local model = dbt_model_name() + if not model then + return + end + + -- Format with conform (sqlfmt), then save + require("conform").format({ async = false, lsp_fallback = true }) + vim.cmd("write") + + -- Build the command + local cmd = string.format(cmd_template, model) + + -- Send to toggleterm (terminal 1), then return focus to the code window + local prev_win = vim.api.nvim_get_current_win() + local term = require("toggleterm.terminal").get(1) + if not term then + term = require("toggleterm.terminal").Terminal:new({ id = 1 }) + end + if not term:is_open() then + term:toggle() + end + term:send(cmd) + vim.api.nvim_set_current_win(prev_win) +end + +-- Find the project root (directory containing dbt_project.yml) +local function dbt_project_root() + local path = vim.fn.findfile("dbt_project.yml", ".;") + if path == "" then + return nil + end + return vim.fn.fnamemodify(path, ":p:h") +end + +-- Send a raw command string to toggleterm (used by picker actions) +local function dbt_cmd_raw(cmd) + local term = require("toggleterm.terminal").get(1) + if not term 
then + term = require("toggleterm.terminal").Terminal:new({ id = 1 }) + end + if not term:is_open() then + term:toggle() + end + term:send(cmd) +end + +-- Read a prompt template and substitute {{key}} placeholders +local function dbt_load_prompt(name, vars) + local prompt_dir = vim.fn.stdpath("config") .. "/dbt/" + local path = prompt_dir .. name .. ".md" + local lines = vim.fn.readfile(path) + if #lines == 0 then + vim.notify("Prompt not found: " .. path, vim.log.levels.ERROR) + return nil + end + local prompt = table.concat(lines, "\n") + for key, value in pairs(vars or {}) do + prompt = prompt:gsub("{{" .. key .. "}}", value) + end + return prompt +end + +-- Jump to model under cursor from {{ ref('model_name') }} or {{ source('src', 'table') }} +vim.keymap.set("n", "<leader>dg", function() + local line = vim.api.nvim_get_current_line() + local col = vim.api.nvim_win_get_cursor(0)[2] + 1 + + -- Try to find ref('model') or ref("model") around cursor + local ref_model = nil + for start_pos, name, end_pos in line:gmatch("()ref%(['\"]([^'\"]+)['\"]%)()" ) do + if col >= start_pos and col <= end_pos then + ref_model = name + break + end + end + + -- Try source('source_name', 'table_name') if no ref found + local source_name, source_table = nil, nil + if not ref_model then + for start_pos, src, tbl, end_pos in line:gmatch("()source%(['\"]([^'\"]+)['\"]%s*,%s*['\"]([^'\"]+)['\"]%)()" ) do + if col >= start_pos and col <= end_pos then + source_name, source_table = src, tbl + break + end + end + end + + if not ref_model and not source_table then + vim.notify("No ref() or source() under cursor", vim.log.levels.WARN) + return + end + + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + -- Search for the model file + local search_name = ref_model or source_table + local matches = vim.fn.globpath(root, "**/" .. search_name .. 
".sql", false, true) + if #matches == 0 then + -- Also try .yml for source definitions + matches = vim.fn.globpath(root, "**/" .. search_name .. ".yml", false, true) + end + + if #matches == 1 then + vim.cmd.edit(matches[1]) + elseif #matches > 1 then + vim.ui.select(matches, { prompt = "Multiple matches:" }, function(choice) + if choice then + vim.cmd.edit(choice) + end + end) + else + vim.notify("No file found for: " .. search_name, vim.log.levels.WARN) + end +end, { desc = "[D]bt [G]o to ref/source" }) + +-- Fuzzy model picker — select a model then choose an action +vim.keymap.set("n", "<leader>df", function() + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + local actions = require("telescope.actions") + local action_state = require("telescope.actions.state") + require("telescope.builtin").find_files({ + prompt_title = "dbt model (enter=open, C-r=run, C-b=build, C-t=test)", + cwd = root, + search_dirs = { "models" }, + find_command = { "fd", "-e", "sql" }, + attach_mappings = function(prompt_bufnr, map) + local function get_model_name() + local entry = action_state.get_selected_entry() + if entry then + return entry[1]:match("([^/]+)%.sql$") + end + end + map("i", "<C-r>", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt run -s " .. name) end + end) + map("i", "<C-b>", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt build -s " .. name) end + end) + map("i", "<C-t>", function() + local name = get_model_name() + actions.close(prompt_bufnr) + if name then dbt_cmd_raw("uv run dbt test -s " .. 
name) end + end) + return true + end, + }) +end, { desc = "[D]bt [F]ind model" }) + +-- Open the compiled SQL for the current model in a split +vim.keymap.set("n", "<leader>do", function() + local model = dbt_model_name() + if not model then + return + end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + local compiled = vim.fn.globpath(root, "target/compiled/**/" .. model .. ".sql", false, true) + if #compiled == 0 then + vim.notify("No compiled SQL found — run dbt compile first", vim.log.levels.WARN) + return + end + vim.cmd("vsplit " .. compiled[1]) + vim.bo.readonly = true + vim.bo.modifiable = false +end, { desc = "[D]bt [O]pen compiled SQL" }) + +-- Grep across all models (search for column names, CTEs, etc.) +vim.keymap.set("n", "<leader>d/", function() + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + require("telescope.builtin").live_grep({ prompt_title = "dbt grep", cwd = root .. 
"/models" }) +end, { desc = "[D]bt search models" }) + +vim.keymap.set("n", "<leader>dr", function() + dbt_cmd("uv run dbt run -s %s") +end, { desc = "[D]bt [R]un current model" }) + +vim.keymap.set("n", "<leader>dR", function() + dbt_cmd("uv run dbt run -s %s+") +end, { desc = "[D]bt [R]un model + downstream" }) + +vim.keymap.set("n", "<leader>db", function() + dbt_cmd("uv run dbt build -s %s") +end, { desc = "[D]bt [B]uild current model (run + test)" }) + +vim.keymap.set("n", "<leader>dc", function() + dbt_cmd("uv run dbt compile -s %s") +end, { desc = "[D]bt [C]ompile current model" }) + +vim.keymap.set("n", "<leader>dt", function() + dbt_cmd("uv run dbt test -s %s") +end, { desc = "[D]bt [T]est current model" }) + +vim.keymap.set("n", "<leader>ds", function() + dbt_cmd("uv run dbt show -s %s") +end, { desc = "[D]bt [S]how preview results" }) + +-- Preview sample rows in a horizontal split +vim.keymap.set("n", "<leader>dp", function() + local model = dbt_model_name() + if not model then + return + end + vim.notify("Fetching preview for " .. model .. "...", vim.log.levels.INFO) + local cmd = { "uv", "run", "dbt", "show", "-s", model, "--limit", "20" } + local output = {} + vim.fn.jobstart(cmd, { + cwd = dbt_project_root(), + stdout_buffered = true, + stderr_buffered = true, + on_stdout = function(_, data) + if data then + vim.list_extend(output, data) + end + end, + on_exit = function(_, exit_code) + vim.schedule(function() + if exit_code ~= 0 then + vim.notify("dbt show failed (exit " .. exit_code .. 
")", vim.log.levels.ERROR) + return + end + -- Trim trailing empty lines + while #output > 0 and output[#output] == "" do + table.remove(output) + end + if #output == 0 then + vim.notify("No rows returned", vim.log.levels.WARN) + return + end + -- Open a scratch buffer in a horizontal split + vim.cmd("botright new") + local buf = vim.api.nvim_get_current_buf() + vim.bo[buf].buftype = "nofile" + vim.bo[buf].bufhidden = "wipe" + vim.bo[buf].swapfile = false + vim.bo[buf].filetype = "sql" + vim.api.nvim_buf_set_name(buf, "dbt-preview://" .. model) + vim.api.nvim_buf_set_lines(buf, 0, -1, false, output) + vim.bo[buf].modifiable = false + -- Resize to fit content (max 20 lines) + local height = math.min(#output, 20) + vim.api.nvim_win_set_height(0, height) + vim.keymap.set("n", "q", "<cmd>close<cr>", { buffer = buf, silent = true }) + end) + end, + }) +end, { desc = "[D]bt [P]review model rows" }) + +-- Show model output as CSV piped into visidata +vim.keymap.set("n", "<leader>dv", function() + local model = dbt_model_name() + if not model then + return + end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + require("conform").format({ async = false, lsp_fallback = true }) + vim.cmd("write") + local prev_win = vim.api.nvim_get_current_win() + local json_to_csv = [[python3 -c " +import sys, json, csv +raw = sys.stdin.read() +for line in raw.splitlines(): + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + preview = None + if 'data' in obj and 'preview' in obj['data']: + preview = obj['data']['preview'] + elif 'results' in obj: + preview = obj['results'][0].get('preview') + elif 'preview' in obj: + preview = obj['preview'] + if preview: + if isinstance(preview, str): + preview = json.loads(preview) + w = csv.DictWriter(sys.stdout, fieldnames=preview[0].keys()) + w.writeheader() + w.writerows(preview) + break +"]] + local cmd = "cd " .. root .. 
" && uv run dbt show -s " .. model .. " --limit 500 --output json --log-format json | " .. json_to_csv .. " | vd -f csv" + require("toggleterm.terminal").Terminal + :new({ + cmd = cmd, + close_on_exit = true, + on_exit = function() + vim.schedule(function() + if vim.api.nvim_win_is_valid(prev_win) then + vim.api.nvim_set_current_win(prev_win) + end + end) + end, + }) + :toggle() +end, { desc = "[D]bt [V]isidata preview" }) + +-- Run cursor-agent on current model (quick analysis with sonnet) +vim.keymap.set("n", "<leader>da", function() + local filepath = vim.fn.expand("%:p") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return + end + local prompt = dbt_load_prompt("dbt_quick_analysis", {}) + if not prompt then + return + end + local bufnr = vim.api.nvim_get_current_buf() + + vim.notify("Running quick analysis...", vim.log.levels.INFO) + + local file_content = table.concat(vim.fn.readfile(filepath), "\n") + local full_prompt = prompt .. "\n\nFile: " .. filepath .. "\n```sql\n" .. file_content .. "\n```" + local cmd = { "cursor-agent", "--print", "--model", "sonnet-4.6", full_prompt } + local output = {} + vim.fn.jobstart(cmd, { + stdout_buffered = true, + on_stdout = function(_, data) + if data then + output = data + end + end, + on_exit = function(_, exit_code) + vim.schedule(function() + if exit_code ~= 0 then + vim.notify("cursor-agent exited with code " .. 
exit_code, vim.log.levels.ERROR) + return + end + -- Remove trailing empty strings from jobstart output + while #output > 0 and output[#output] == "" do + table.remove(output) + end + if #output == 0 then + vim.notify("No output from cursor-agent", vim.log.levels.WARN) + return + end + vim.api.nvim_buf_set_lines(bufnr, 0, -1, false, output) + vim.notify("Inline comments added — review and :w to save, or :u to undo", vim.log.levels.INFO) + end) + end, + }) +end, { desc = "[D]bt [A]nalyse model (quick)" }) + +-- Open interactive cursor-agent session in a new tmux window with compiled SQL + sample rows as context +vim.keymap.set("n", "<leader>dA", function() + local filepath = vim.fn.expand("%:p") + if filepath == "" then + vim.notify("No file open", vim.log.levels.WARN) + return + end + local model = dbt_model_name() + if not model then + return + end + local root = dbt_project_root() + if not root then + vim.notify("No dbt_project.yml found", vim.log.levels.WARN) + return + end + + -- Open a new tmux window running the standalone dbt_analyse.py script + local prompt_path = vim.fn.stdpath("config") .. "/dbt/dbt_deep_analysis.md" + local script_path = vim.fn.stdpath("config") .. "/dbt/dbt_analyse.py" + local shell_script = string.format( + [[cd %s && uv run python3 %s --model %s --root %s --filepath %s --prompt %s || (echo "Press enter to close..." && read)]], + root, script_path, model, root, filepath, prompt_path + ) + vim.fn.jobstart({ "tmux", "new-window", "-n", "dbt:" .. model, shell_script }, { detach = true }) + vim.notify("Opened interactive cursor-agent session in tmux window 'dbt:" .. model .. 
"'", vim.log.levels.INFO) +end, { desc = "[D]bt [A]nalyse model (interactive)" }) diff --git a/nvim/lua/config/keymaps.lua b/nvim/lua/config/keymaps.lua index 2e13c82..6dab4bf 100644 --- a/nvim/lua/config/keymaps.lua +++ b/nvim/lua/config/keymaps.lua @@ -52,410 +52,5 @@ vim.keymap.set("n", "<leader>wi", function() }) end, { noremap = true, silent = true, desc = "[W]iki [I]nsert Link" }) --- dbt: extract model name from current file path (e.g., models/staging/stg_orders.sql -> stg_orders) -local function dbt_model_name() - local filepath = vim.fn.expand("%:t:r") - if filepath == "" then - vim.notify("No file open", vim.log.levels.WARN) - return nil - end - return filepath -end - --- dbt: format the current SQL file, then send a dbt command to the terminal -local function dbt_cmd(cmd_template) - local model = dbt_model_name() - if not model then - return - end - - -- Format with conform (sqlfmt), then save - require("conform").format({ async = false, lsp_fallback = true }) - vim.cmd("write") - - -- Build the command - local cmd = string.format(cmd_template, model) - - -- Send to toggleterm (terminal 1), then return focus to the code window - local prev_win = vim.api.nvim_get_current_win() - local term = require("toggleterm.terminal").get(1) - if not term then - term = require("toggleterm.terminal").Terminal:new({ id = 1 }) - end - if not term:is_open() then - term:toggle() - end - term:send(cmd) - vim.api.nvim_set_current_win(prev_win) -end - --- dbt: find the project root (directory containing dbt_project.yml) -local function dbt_project_root() - local path = vim.fn.findfile("dbt_project.yml", ".;") - if path == "" then - return nil - end - return vim.fn.fnamemodify(path, ":p:h") -end - --- dbt: jump to model under cursor from {{ ref('model_name') }} or {{ source('src', 'table') }} -vim.keymap.set("n", "<leader>dg", function() - local line = vim.api.nvim_get_current_line() - local col = vim.api.nvim_win_get_cursor(0)[2] + 1 - - -- Try to find ref('model') or 
ref("model") around cursor - local ref_model = nil - for start_pos, name, end_pos in line:gmatch("()ref%(['\"]([^'\"]+)['\"]%)()" ) do - if col >= start_pos and col <= end_pos then - ref_model = name - break - end - end - - -- Try source('source_name', 'table_name') if no ref found - local source_name, source_table = nil, nil - if not ref_model then - for start_pos, src, tbl, end_pos in line:gmatch("()source%(['\"]([^'\"]+)['\"]%s*,%s*['\"]([^'\"]+)['\"]%)()" ) do - if col >= start_pos and col <= end_pos then - source_name, source_table = src, tbl - break - end - end - end - - if not ref_model and not source_table then - vim.notify("No ref() or source() under cursor", vim.log.levels.WARN) - return - end - - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - - -- Search for the model file - local search_name = ref_model or source_table - local matches = vim.fn.globpath(root, "**/" .. search_name .. ".sql", false, true) - if #matches == 0 then - -- Also try .yml for source definitions - matches = vim.fn.globpath(root, "**/" .. search_name .. ".yml", false, true) - end - - if #matches == 1 then - vim.cmd.edit(matches[1]) - elseif #matches > 1 then - vim.ui.select(matches, { prompt = "Multiple matches:" }, function(choice) - if choice then - vim.cmd.edit(choice) - end - end) - else - vim.notify("No file found for: " .. 
search_name, vim.log.levels.WARN) - end -end, { desc = "[D]bt [G]o to ref/source" }) - --- dbt: send a raw command string to toggleterm (used by picker actions) -local function dbt_cmd_raw(cmd) - local term = require("toggleterm.terminal").get(1) - if not term then - term = require("toggleterm.terminal").Terminal:new({ id = 1 }) - end - if not term:is_open() then - term:toggle() - end - term:send(cmd) -end - --- dbt: fuzzy model picker — select a model then choose an action -vim.keymap.set("n", "<leader>df", function() - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - - local actions = require("telescope.actions") - local action_state = require("telescope.actions.state") - require("telescope.builtin").find_files({ - prompt_title = "dbt model (enter=open, C-r=run, C-b=build, C-t=test)", - cwd = root, - search_dirs = { "models" }, - find_command = { "fd", "-e", "sql" }, - attach_mappings = function(prompt_bufnr, map) - local function get_model_name() - local entry = action_state.get_selected_entry() - if entry then - return entry[1]:match("([^/]+)%.sql$") - end - end - map("i", "<C-r>", function() - local name = get_model_name() - actions.close(prompt_bufnr) - if name then dbt_cmd_raw("uv run dbt run -s " .. name) end - end) - map("i", "<C-b>", function() - local name = get_model_name() - actions.close(prompt_bufnr) - if name then dbt_cmd_raw("uv run dbt build -s " .. name) end - end) - map("i", "<C-t>", function() - local name = get_model_name() - actions.close(prompt_bufnr) - if name then dbt_cmd_raw("uv run dbt test -s " .. 
name) end - end) - return true - end, - }) -end, { desc = "[D]bt [F]ind model" }) - --- dbt: open the compiled SQL for the current model in a split -vim.keymap.set("n", "<leader>do", function() - local model = dbt_model_name() - if not model then - return - end - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - local compiled = vim.fn.globpath(root, "target/compiled/**/" .. model .. ".sql", false, true) - if #compiled == 0 then - vim.notify("No compiled SQL found — run dbt compile first", vim.log.levels.WARN) - return - end - vim.cmd("vsplit " .. compiled[1]) - vim.bo.readonly = true - vim.bo.modifiable = false -end, { desc = "[D]bt [O]pen compiled SQL" }) - --- dbt: grep across all models (search for column names, CTEs, etc.) -vim.keymap.set("n", "<leader>d/", function() - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - require("telescope.builtin").live_grep({ prompt_title = "dbt grep", cwd = root .. 
"/models" }) -end, { desc = "[D]bt search models" }) - -vim.keymap.set("n", "<leader>dr", function() - dbt_cmd("uv run dbt run -s %s") -end, { desc = "[D]bt [R]un current model" }) - -vim.keymap.set("n", "<leader>dR", function() - dbt_cmd("uv run dbt run -s %s+") -end, { desc = "[D]bt [R]un model + downstream" }) - -vim.keymap.set("n", "<leader>db", function() - dbt_cmd("uv run dbt build -s %s") -end, { desc = "[D]bt [B]uild current model (run + test)" }) - -vim.keymap.set("n", "<leader>dc", function() - dbt_cmd("uv run dbt compile -s %s") -end, { desc = "[D]bt [C]ompile current model" }) - -vim.keymap.set("n", "<leader>dt", function() - dbt_cmd("uv run dbt test -s %s") -end, { desc = "[D]bt [T]est current model" }) - -vim.keymap.set("n", "<leader>ds", function() - dbt_cmd("uv run dbt show -s %s") -end, { desc = "[D]bt [S]how preview results" }) - --- dbt: preview sample rows in a horizontal split -vim.keymap.set("n", "<leader>dp", function() - local model = dbt_model_name() - if not model then - return - end - vim.notify("Fetching preview for " .. model .. "...", vim.log.levels.INFO) - local cmd = { "uv", "run", "dbt", "show", "-s", model, "--limit", "20" } - local output = {} - vim.fn.jobstart(cmd, { - cwd = dbt_project_root(), - stdout_buffered = true, - stderr_buffered = true, - on_stdout = function(_, data) - if data then - vim.list_extend(output, data) - end - end, - on_exit = function(_, exit_code) - vim.schedule(function() - if exit_code ~= 0 then - vim.notify("dbt show failed (exit " .. exit_code .. 
")", vim.log.levels.ERROR) - return - end - -- Trim trailing empty lines - while #output > 0 and output[#output] == "" do - table.remove(output) - end - if #output == 0 then - vim.notify("No rows returned", vim.log.levels.WARN) - return - end - -- Open a scratch buffer in a horizontal split - vim.cmd("botright new") - local buf = vim.api.nvim_get_current_buf() - vim.bo[buf].buftype = "nofile" - vim.bo[buf].bufhidden = "wipe" - vim.bo[buf].swapfile = false - vim.bo[buf].filetype = "sql" - vim.api.nvim_buf_set_name(buf, "dbt-preview://" .. model) - vim.api.nvim_buf_set_lines(buf, 0, -1, false, output) - vim.bo[buf].modifiable = false - -- Resize to fit content (max 20 lines) - local height = math.min(#output, 20) - vim.api.nvim_win_set_height(0, height) - vim.keymap.set("n", "q", "<cmd>close<cr>", { buffer = buf, silent = true }) - end) - end, - }) -end, { desc = "[D]bt [P]review model rows" }) - --- dbt: show model output as CSV piped into visidata -vim.keymap.set("n", "<leader>dv", function() - local model = dbt_model_name() - if not model then - return - end - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - require("conform").format({ async = false, lsp_fallback = true }) - vim.cmd("write") - local prev_win = vim.api.nvim_get_current_win() - local json_to_csv = [[python3 -c " -import sys, json, csv -raw = sys.stdin.read() -for line in raw.splitlines(): - try: - obj = json.loads(line) - except json.JSONDecodeError: - continue - preview = None - if 'data' in obj and 'preview' in obj['data']: - preview = obj['data']['preview'] - elif 'results' in obj: - preview = obj['results'][0].get('preview') - elif 'preview' in obj: - preview = obj['preview'] - if preview: - if isinstance(preview, str): - preview = json.loads(preview) - w = csv.DictWriter(sys.stdout, fieldnames=preview[0].keys()) - w.writeheader() - w.writerows(preview) - break -"]] - local cmd = "cd " .. root .. 
" && uv run dbt show -s " .. model .. " --limit 500 --output json --log-format json | " .. json_to_csv .. " | vd -f csv" - require("toggleterm.terminal").Terminal - :new({ - cmd = cmd, - close_on_exit = true, - on_exit = function() - vim.schedule(function() - if vim.api.nvim_win_is_valid(prev_win) then - vim.api.nvim_set_current_win(prev_win) - end - end) - end, - }) - :toggle() -end, { desc = "[D]bt [V]isidata preview" }) - --- dbt: read a prompt template from the llm/ directory and substitute {{key}} placeholders -local function dbt_load_prompt(name, vars) - local prompt_dir = vim.fn.stdpath("config") .. "/dbt/" - local path = prompt_dir .. name .. ".md" - local lines = vim.fn.readfile(path) - if #lines == 0 then - vim.notify("Prompt not found: " .. path, vim.log.levels.ERROR) - return nil - end - local prompt = table.concat(lines, "\n") - for key, value in pairs(vars or {}) do - prompt = prompt:gsub("{{" .. key .. "}}", value) - end - return prompt -end - --- dbt: run cursor-agent on current model (quick analysis with sonnet) --- Replaces buffer contents with the file + inline SQL comments -vim.keymap.set("n", "<leader>da", function() - local filepath = vim.fn.expand("%:p") - if filepath == "" then - vim.notify("No file open", vim.log.levels.WARN) - return - end - local prompt = dbt_load_prompt("dbt_quick_analysis", {}) - if not prompt then - return - end - local bufnr = vim.api.nvim_get_current_buf() - - vim.notify("Running quick analysis...", vim.log.levels.INFO) - - local file_content = table.concat(vim.fn.readfile(filepath), "\n") - local full_prompt = prompt .. "\n\nFile: " .. filepath .. "\n```sql\n" .. file_content .. 
"\n```" - local cmd = { "cursor-agent", "--print", "--model", "sonnet-4.6", full_prompt } - local output = {} - vim.fn.jobstart(cmd, { - stdout_buffered = true, - on_stdout = function(_, data) - if data then - output = data - end - end, - on_exit = function(_, exit_code) - vim.schedule(function() - if exit_code ~= 0 then - vim.notify("cursor-agent exited with code " .. exit_code, vim.log.levels.ERROR) - return - end - -- Remove trailing empty strings from jobstart output - while #output > 0 and output[#output] == "" do - table.remove(output) - end - if #output == 0 then - vim.notify("No output from cursor-agent", vim.log.levels.WARN) - return - end - vim.api.nvim_buf_set_lines(bufnr, 0, -1, false, output) - vim.notify("Inline comments added — review and :w to save, or :u to undo", vim.log.levels.INFO) - end) - end, - }) -end, { desc = "[D]bt [A]nalyse model (quick)" }) - --- dbt: open interactive cursor-agent session in a new tmux window with compiled SQL + sample rows as context -vim.keymap.set("n", "<leader>dA", function() - local filepath = vim.fn.expand("%:p") - if filepath == "" then - vim.notify("No file open", vim.log.levels.WARN) - return - end - local model = dbt_model_name() - if not model then - return - end - local root = dbt_project_root() - if not root then - vim.notify("No dbt_project.yml found", vim.log.levels.WARN) - return - end - - -- Open a new tmux window running the standalone dbt_analyse.py script - local prompt_path = vim.fn.stdpath("config") .. "/dbt/dbt_deep_analysis.md" - local script_path = vim.fn.stdpath("config") .. "/dbt/dbt_analyse.py" - local shell_script = string.format( - [[cd %s && uv run python3 %s --model %s --root %s --filepath %s --prompt %s || (echo "Press enter to close..." && read)]], - root, script_path, model, root, filepath, prompt_path - ) - vim.fn.jobstart({ "tmux", "new-window", "-n", "dbt:" .. model, shell_script }, { detach = true }) - vim.notify("Opened interactive cursor-agent session in tmux window 'dbt:" .. 
model .. "'", vim.log.levels.INFO) -end, { desc = "[D]bt [A]nalyse model (interactive)" }) +-- dbt keymaps (loaded from separate file) +require("config.dbt") From f31143b9bf2471c63d06e44818fa20a81d790377 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 8 Mar 2026 22:43:46 +0000 Subject: [PATCH 24/26] Add full spec for data-audit Python package Covers artifact types (dbt models, notebooks, flat files, Quarto docs), the PM orchestrator loop, structured findings model, LLM backend abstraction, CLI/API design, and phased implementation plan. https://claude.ai/code/session_019YdrjfrB6Lgu5QTZzrnXdb --- dbt/SPEC.md | 549 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 549 insertions(+) create mode 100644 dbt/SPEC.md diff --git a/dbt/SPEC.md b/dbt/SPEC.md new file mode 100644 index 0000000..cdd97d5 --- /dev/null +++ b/dbt/SPEC.md @@ -0,0 +1,549 @@ +# data-audit: specification + +A Python package for LLM-powered auditing of data artifacts — dbt models, SQL +scripts, Jupyter notebooks, Quarto documents, and flat data files. Designed to +be run by a single analyst or orchestrated by an autonomous "project manager" +agent that can expand the scope of an audit based on initial findings. + +## Problem + +Data teams accumulate a mix of dbt models, ad-hoc notebooks, CSV/Parquet +exports, and glue scripts. Quality issues hide at the seams: a notebook reads a +stale CSV export instead of the dbt model, a model silently fans out on a join, +a Quarto report hardcodes a date filter that drifts. Today's tooling audits +each artifact type in isolation — if at all. + +## Goals + +1. Audit heterogeneous data artifacts (dbt SQL, notebooks, flat files) through + a unified interface. +2. Support an autonomous orchestration loop where an LLM "project manager" can + review initial findings and spawn follow-up audits when warranted. +3. 
Produce structured, machine-readable findings alongside human-readable + reports so that both people and agents can act on results. +4. Be LLM-backend agnostic — work with direct API calls, local models, or CLI + wrappers. +5. Ship as an installable Python package (`pip install data-audit`) with a CLI + entry point. + +## Non-goals + +- Replacing dbt test or Great Expectations for deterministic, rule-based + testing. This tool is for exploratory, LLM-driven analysis. +- Real-time or CI-blocking checks. This is an offline review tool. +- Supporting non-Python notebook kernels (R, Julia) in the first version. + +--- + +## Artifact types + +### dbt models (.sql within a dbt project) + +This is the existing capability. Context gathering includes: + +- Compile the model (`dbt compile`) +- Fetch sample rows (`dbt show --output json`) +- Extract lineage — immediate parents and children (`dbt ls`) +- Discover existing tests from `schema.yml` files +- Read the source SQL + +Audit focus: join correctness, grain/uniqueness, filter logic, type mismatches, +test coverage gaps, upstream dependency risks. See the existing +`dbt_deep_analysis.md` prompt template for the full checklist. + +### dbt schema files (.yml) + +Context gathering: + +- Parse the YAML +- Cross-reference with the models it describes (do all models have schema + entries? do all columns exist?) +- Check test coverage across models + +Audit focus: missing descriptions, undocumented columns, missing or weak tests, +inconsistent naming conventions, stale entries for deleted models. + +### Jupyter notebooks (.ipynb) + +Notebooks vary widely. 
An analyst might be: + +- Running SQL queries against a warehouse +- Reading CSV/Parquet/Excel files from disk or S3 +- Pulling data from an API +- Doing pandas/polars transformations +- Producing charts or summary statistics + +Context gathering: + +- Extract all code cells and their outputs (where present) +- Identify data sources: SQL connection strings, file paths + (`pd.read_csv(...)`, `pd.read_parquet(...)`), API calls +- Extract imported libraries to understand the toolchain +- Note cell execution order and whether outputs are stale (execution count + gaps, missing outputs) +- If the notebook references dbt models or known tables, cross-reference with + the dbt project + +Audit focus: +- **Data source hygiene**: is the notebook reading raw files that should come + from a managed source? Are file paths hardcoded or parameterized? Are + connection strings embedded in code? +- **Reproducibility**: are cells ordered logically? Are there cells that depend + on execution side effects? Are random seeds set? Are outputs present and + consistent with the code? +- **Transformation logic**: are there pandas/polars transformations that + duplicate or contradict dbt model logic? Could this logic be pushed into the + transformation layer? +- **Data quality in-notebook**: are there silent drops (inner joins, dropna) + that could hide problems? Are filters reasonable? Are aggregations correct? +- **Staleness**: do outputs reference dates/values that suggest the notebook + hasn't been re-run recently? 
+ +### Quarto documents (.qmd) + +Similar to notebooks but with additional structure: + +- YAML front matter (params, output format, execution options) +- Mixed prose and code chunks +- Cross-references and callouts + +Context gathering: + +- Parse YAML front matter for parameters and execution options +- Extract code chunks (```{python} blocks) +- Identify data sources same as notebooks +- Note `freeze: true` or other execution-control settings + +Audit focus: same as notebooks, plus parameter handling (are params used +consistently? are defaults reasonable?), and whether `freeze` settings mean +outputs could be stale. + +### Flat data files (.csv, .parquet, .json, .xlsx) + +Not code artifacts — these are data. An analyst might ask "audit this export" +or "check this CSV for issues." The PM agent might also flag a file referenced +by a notebook for closer inspection. + +Context gathering: + +- Read schema/column names and types +- Compute basic profiling: row count, null rates, cardinality, min/max/mean + for numerics, sample values for categoricals, date ranges for timestamps +- For Parquet: extract embedded schema metadata +- For CSV: detect delimiter, encoding, quoting issues +- If the file is referenced by a notebook or dbt model, note that relationship + +Audit focus: +- **Schema issues**: mixed types in columns, inconsistent date formats, + encoding problems +- **Completeness**: unexpected nulls, empty columns, truncated rows +- **Distribution anomalies**: outliers, impossible values (negative ages, + future dates), suspicious cardinality (a "country" column with 3 values) +- **Staleness**: if file metadata includes a timestamp, flag if it's old + relative to the project + +--- + +## Architecture + +### Core data model + +``` +Finding + id: str # unique within an audit run + artifact: str # path or identifier of the audited artifact + artifact_type: str # "dbt_model", "notebook", "data_file", etc. 
+ category: str # "join_correctness", "data_quality", "reproducibility", etc. + severity: Severity # critical | warning | info + title: str # one-line summary + description: str # full explanation + evidence: str | None # query, code snippet, or data sample that demonstrates the issue + suggested_fix: str | None + downstream_impact: list[str] # artifact IDs affected if this isn't fixed + metadata: dict # arbitrary extra context (column names, line numbers, etc.) + +AuditResult + artifact: str + artifact_type: str + findings: list[Finding] + context_summary: str # what context was gathered (for traceability) + llm_model: str # which LLM produced this result + duration_seconds: float + raw_report: str # the full markdown report as returned by the LLM + +AuditPlan + tasks: list[AuditTask] # the full set of tasks (initial + follow-ups) + +AuditTask + artifact: str + artifact_type: str + prompt_template: str # which prompt to use + reason: str # why this task exists ("initial scan" or "follow-up: NULL propagation from stg_orders") + priority: int + parent_task_id: str | None # if this is a follow-up, which task spawned it + status: pending | running | completed | failed +``` + +### Auditor interface + +Each artifact type has an auditor that implements: + +```python +class Auditor(Protocol): + """Gathers context for an artifact and produces an LLM-ready prompt.""" + + artifact_type: str + + def can_handle(self, path: Path) -> bool: + """Return True if this auditor knows how to handle the given path.""" + ... + + def gather_context(self, path: Path, config: AuditConfig) -> AuditContext: + """Read the artifact and its surroundings, return structured context.""" + ... + + def render_prompt(self, context: AuditContext, template: str) -> str: + """Fill the prompt template with gathered context.""" + ... + + def parse_findings(self, raw_response: str, context: AuditContext) -> list[Finding]: + """Extract structured findings from the LLM's raw response.""" + ... 
+``` + +Concrete implementations: + +- `DbtModelAuditor` — wraps the existing compile/show/lineage/test-discovery logic +- `DbtSchemaAuditor` — parses YAML, cross-references models +- `NotebookAuditor` — extracts cells, identifies data sources, checks reproducibility +- `QuartoAuditor` — extends NotebookAuditor with front-matter parsing +- `DataFileAuditor` — profiles CSVs, Parquet, JSON, Excel files + +### LLM backend interface + +```python +class LLMBackend(Protocol): + """Send a prompt to an LLM and get a response.""" + + def complete(self, prompt: str, system: str | None = None) -> str: + ... + + def list_models(self) -> list[str]: + ... +``` + +Concrete implementations: + +- `CursorAgentBackend` — wraps the current `cursor-agent` CLI (preserves + existing workflow) +- `AnthropicBackend` — direct Claude API calls via the Anthropic SDK +- `LiteLLMBackend` — any model supported by litellm (OpenAI, local, etc.) + +### Orchestrator (the "project manager") + +The orchestrator is the key new capability. It operates in a loop: + +``` +1. Build initial AuditPlan from user-provided paths +2. While there are pending tasks and budget remains: + a. Pick next task(s) by priority + b. Run auditor(s) in parallel → collect AuditResults + c. Pass results to PM agent with the question: + "Given these findings, should we investigate anything further?" + d. PM agent returns zero or more follow-up AuditTasks + (e.g., "audit downstream model X", "profile data file Y referenced + in notebook Z", "do a deeper join analysis on model W") + e. Add follow-up tasks to the plan +3. 
Synthesize all AuditResults into a final report +``` + +**Budget controls** — the orchestrator enforces limits to prevent runaway +analysis: + +- `max_depth`: how many rounds of follow-ups (default: 2) +- `max_tasks`: total task cap across all rounds (default: 20) +- `max_wall_clock`: overall time limit (default: 30 minutes) +- `max_tokens`: approximate token budget across all LLM calls + +The PM agent prompt instructs it to be conservative — only spawn follow-up +tasks when there's concrete evidence that a finding has broader implications, +not speculatively. + +### Task graph + +Tasks form a tree (not a DAG — a follow-up task has exactly one parent). The +graph is used for: + +- Tracing why an audit was run ("this file was audited because the PM agent + identified it as a downstream consumer of a model with a join fan-out") +- Reporting: the final synthesis groups findings by their provenance +- Budget tracking: depth = longest path from root to leaf + +### Prompt templates + +Ship as package data under `data_audit/prompts/`: + +``` +dbt_model_deep.md # existing dbt_deep_analysis.md +dbt_model_quick.md # existing dbt_quick_analysis.md +dbt_schema.md # schema.yml audit +notebook.md # Jupyter/Quarto notebook audit +data_file.md # flat file profiling + audit +orchestrator.md # PM agent system prompt +synthesis.md # final report synthesis prompt +``` + +The existing Handlebars-style template engine (`{{var}}`, `{{#if var}}`, +`{{^if var}}`) is retained. It's simple and sufficient. + +--- + +## CLI + +``` +data-audit [OPTIONS] PATHS... +``` + +**Arguments:** + +- `PATHS` — files, directories, or globs. The tool auto-detects artifact type + by extension (`.sql` in a dbt project → dbt model, `.ipynb` → notebook, + `.csv`/`.parquet` → data file, etc.). Directories are walked recursively. 
+ +**Options:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--llm` | (required, repeatable) | LLM model to use for audits | +| `--backend` | `anthropic` | LLM backend: `anthropic`, `cursor-agent`, `litellm` | +| `--dbt-root` | auto-detect | Path to dbt project root (for dbt artifacts) | +| `--prompt` | built-in | Override the prompt template | +| `--output-dir` | `./audit_reports` | Where to write reports | +| `--output-format` | `markdown` | `markdown`, `json`, or `both` | +| `--limit` | `20` | Row limit for dbt show / data file sampling | +| `--synthesis-model` | same as `--llm` | LLM for synthesis step | +| `--orchestrate` | `false` | Enable the PM orchestration loop | +| `--max-depth` | `2` | Max follow-up rounds (requires `--orchestrate`) | +| `--max-tasks` | `20` | Max total audit tasks (requires `--orchestrate`) | +| `--concurrency` | `3` | Max parallel LLM calls | +| `--timeout` | `900` | Per-task timeout in seconds | +| `--verbose` | `false` | Show detailed progress | + +**Examples:** + +```bash +# Audit a few dbt models (current behavior, just packaged) +data-audit models/stg_orders.sql models/int_revenue.sql \ + --llm claude-sonnet-4-6 --dbt-root . 
+ +# Audit a notebook +data-audit analysis/revenue_deep_dive.ipynb --llm claude-sonnet-4-6 + +# Audit an export file +data-audit exports/monthly_summary.csv --llm claude-sonnet-4-6 + +# Audit an entire directory with PM orchestration +data-audit models/ notebooks/ exports/ \ + --llm claude-sonnet-4-6 \ + --orchestrate --max-depth 2 --max-tasks 15 + +# JSON output for programmatic consumption +data-audit models/stg_orders.sql --llm claude-sonnet-4-6 --output-format json +``` + +--- + +## Programmatic API + +```python +from data_audit import audit, AuditConfig +from data_audit.backends import AnthropicBackend + +config = AuditConfig( + llm="claude-sonnet-4-6", + backend=AnthropicBackend(), + concurrency=3, + orchestrate=True, + max_depth=2, +) + +# Audit specific files +results = audit(["models/stg_orders.sql", "analysis/deep_dive.ipynb"], config) + +for finding in results.all_findings(): + print(f"[{finding.severity}] {finding.artifact}: {finding.title}") + +# Access the structured data +results.to_json("audit_output.json") +results.to_markdown("audit_report.md") +``` + +--- + +## Package structure + +``` +data-audit/ + pyproject.toml + src/data_audit/ + __init__.py # public API: audit(), AuditConfig + cli.py # click CLI entry point + + # Core data model + models.py # Finding, AuditResult, AuditTask, AuditPlan + + # Auditors — one per artifact type + auditors/ + __init__.py + base.py # Auditor protocol + dbt_model.py # compile, show, lineage, test discovery + dbt_schema.py # schema.yml cross-referencing + notebook.py # .ipynb extraction and analysis + quarto.py # .qmd front-matter + code chunks + data_file.py # CSV, Parquet, JSON, Excel profiling + registry.py # maps file extensions/patterns → auditors + + # Context gathering utilities + context/ + __init__.py + dbt.py # dbt compile/show/ls wrappers + notebook_parser.py # cell extraction, data source detection + file_profiler.py # column stats, null rates, distributions + template.py # Handlebars-style template 
rendering + + # LLM backends + backends/ + __init__.py + base.py # LLMBackend protocol + anthropic.py # direct Anthropic API + cursor_agent.py # subprocess wrapper for cursor-agent CLI + litellm.py # litellm pass-through + + # Orchestration + orchestrator/ + __init__.py + pm_agent.py # project manager LLM logic + task_graph.py # task tree, budget tracking + executor.py # parallel task execution + synthesizer.py # final report generation + + # Shipped prompt templates + prompts/ + dbt_model_deep.md + dbt_model_quick.md + dbt_schema.md + notebook.md + quarto.md + data_file.md + orchestrator.md + synthesis.md +``` + +--- + +## Migration path from current code + +The existing scripts (`dbt_batch_audit.py`, `dbt_analyse.py`) become thin +wrappers or are replaced entirely by the package CLI. Mapping: + +| Current | Package equivalent | +|---------|--------------------| +| `dbt_batch_audit.py` | `data-audit models/*.sql --llm ... --dbt-root .` | +| `dbt_analyse.py` (interactive tmux) | `data-audit --interactive models/stg_orders.sql` (or keep as a separate nvim-specific script that calls the package API) | +| `dbt_deep_analysis.md` | `src/data_audit/prompts/dbt_model_deep.md` | +| `dbt_quick_analysis.md` | `src/data_audit/prompts/dbt_model_quick.md` | +| `render_template()` | `src/data_audit/context/template.py` | +| `compile_model()`, `get_sample_rows()`, etc. | `src/data_audit/context/dbt.py` | +| `run_audit()` | `src/data_audit/orchestrator/executor.py` | +| `synthesize_reports()` | `src/data_audit/orchestrator/synthesizer.py` | +| cursor-agent hardcoding | `src/data_audit/backends/cursor_agent.py` (one of N backends) | + +The nvim integration (`nvim/lua/config/dbt.lua`) continues to work — it just +calls `data-audit` instead of the raw Python scripts. 
+ +--- + +## Dependencies + +**Required:** + +- `click` — CLI framework +- `pyyaml` — YAML parsing (dbt schema files, Quarto front matter) + +**Optional (extras):** + +- `anthropic` — for the Anthropic backend (`pip install data-audit[anthropic]`) +- `litellm` — for the litellm backend (`pip install data-audit[litellm]`) +- `mdformat` — for markdown formatting of output reports +- `nbformat` — for robust Jupyter notebook parsing (fallback: raw JSON parsing) +- `pyarrow` or `polars` — for Parquet file profiling +- `openpyxl` — for Excel file profiling + +--- + +## Implementation phases + +### Phase 1: Extract and package + +Move the existing dbt model audit logic into the package structure. Get +`data-audit models/*.sql --llm ... --dbt-root .` working identically to the +current `dbt_batch_audit.py`. Ship the Auditor protocol, the dbt model auditor, +the cursor-agent backend, and the existing prompt templates. No new +capabilities — just packaging. + +### Phase 2: LLM backends + structured output + +Add the Anthropic and litellm backends. Implement the `Finding` data model and +`parse_findings()` so that results are available as structured data (JSON +output). This unblocks the orchestrator since the PM agent needs machine-readable +findings to reason over. + +### Phase 3: Notebook and data file auditors + +Implement `NotebookAuditor`, `QuartoAuditor`, and `DataFileAuditor` with their +context gatherers and prompt templates. At this point `data-audit` can audit +any artifact type, but only in a flat batch (no orchestration loop). + +### Phase 4: Orchestrator + +Implement the PM agent loop: initial plan → audit → review findings → spawn +follow-ups → synthesize. This is the most architecturally complex phase and +benefits from phases 2-3 being stable first. 
+ +### Phase 5: Cross-artifact analysis + +Teach the orchestrator to reason across artifact types: "this notebook reads +the same table that dbt model X writes to — flag if findings in X affect the +notebook's conclusions." This requires a lightweight project graph (dbt lineage ++ notebook data source detection + file references). + +--- + +## Open questions + +1. **Package name**: `data-audit` is clean but generic. Alternatives: + `llm-data-audit`, `data-artifact-audit`, `dbt-audit-llm`. The scope beyond + dbt suggests avoiding `dbt-` as a prefix. + +2. **Structured finding extraction**: should the LLM be asked to return JSON + directly (fragile), or should we post-process the markdown response with a + second LLM call or regex-based parser? A hybrid approach (ask for markdown + with a machine-readable YAML block at the end) may be most robust. + +3. **Database access for data file profiling**: profiling flat files requires + reading them. For large files, should the tool spin up a temporary DuckDB + instance to run profiling queries, or rely on pandas/polars sampling? DuckDB + is likely the right answer since it handles CSV/Parquet/JSON natively and + the audit prompts already assume SQL-based evidence queries. + +4. **Interactive mode**: the current `dbt_analyse.py` launches an interactive + agent session in tmux. Should the package support this, or should it remain + a thin wrapper in the dotfiles that calls the package API? Leaning toward + the latter — interactive UX is editor-specific and doesn't belong in the + package. + +5. **Notebook output handling**: should the auditor include cell outputs + (which can be large — images, dataframes, tracebacks) in the LLM context? + Probably: include text outputs and truncated dataframe representations, + skip images unless the LLM supports vision, always note when outputs are + missing. 
From 01a367ba5ffda662248d6fef5fabd9ad0ff462b2 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Tue, 10 Mar 2026 01:44:59 +0000 Subject: [PATCH 25/26] Fix black formatting in dbt Python scripts Merge master and reformat dbt_analyse.py and dbt_batch_audit.py to pass the CI formatting checks. https://claude.ai/code/session_01BPxJNKYipwwrx17jvyv6rM --- dbt/dbt_analyse.py | 52 +++++++++++++----- dbt/dbt_batch_audit.py | 117 ++++++++++++++++++++++++++++------------- 2 files changed, 118 insertions(+), 51 deletions(-) diff --git a/dbt/dbt_analyse.py b/dbt/dbt_analyse.py index f065d22..dd92fa3 100755 --- a/dbt/dbt_analyse.py +++ b/dbt/dbt_analyse.py @@ -42,13 +42,22 @@ def run(cmd, cwd=None, capture=False, check=True): def get_lineage(model, root): """Return a summary of immediate parents and children from dbt ls.""" lines = [] - for direction, selector in [("parents", f"+{model},1+{model}"), ("children", f"{model}+,{model}1+")]: + for direction, selector in [ + ("parents", f"+{model},1+{model}"), + ("children", f"{model}+,{model}1+"), + ]: result = subprocess.run( ["uv", "run", "dbt", "ls", "-s", selector, "--output", "name", "--quiet"], - capture_output=True, text=True, cwd=root, + capture_output=True, + text=True, + cwd=root, ) if result.returncode == 0: - names = [n.strip() for n in result.stdout.strip().splitlines() if n.strip() and n.strip() != model] + names = [ + n.strip() + for n in result.stdout.strip().splitlines() + if n.strip() and n.strip() != model + ] if names: lines.append(f"**{direction.title()}:** {', '.join(names)}") return "\n".join(lines) if lines else "" @@ -86,7 +95,6 @@ def get_existing_tests(model, root): return "\n".join(tests) if tests else "" - def render_template(template, replacements): """Replace template placeholders, handling conditional {{#if}}/{{^if}} blocks.""" for key, value in replacements.items(): @@ -138,8 +146,18 @@ def main(model, root, filepath, prompt, limit, model_flag): click.echo(f"Fetching sample rows 
(limit={limit})...") result = run( [ - "uv", "run", "dbt", "show", "-s", model, - "--limit", str(limit), "--output", "json", "--log-format", "json", + "uv", + "run", + "dbt", + "show", + "-s", + model, + "--limit", + str(limit), + "--output", + "json", + "--log-format", + "json", ], cwd=root, capture=True, @@ -175,18 +193,24 @@ def main(model, root, filepath, prompt, limit, model_flag): with open(prompt) as f: template = f.read() - full_prompt = render_template(template, { - "compiled_sql": compiled_sql, - "sample_rows": sample_rows, - "existing_tests": existing_tests, - "lineage": lineage, - "data_profile": "", - }) + full_prompt = render_template( + template, + { + "compiled_sql": compiled_sql, + "sample_rows": sample_rows, + "existing_tests": existing_tests, + "lineage": lineage, + "data_profile": "", + }, + ) full_prompt += f"\n\nSource SQL:\n{source_sql}" # --- 7. write context to a temp file & launch cursor-agent --- ctx = tempfile.NamedTemporaryFile( - mode="w", suffix=".md", prefix=f"dbt_audit_{model}_", delete=False, + mode="w", + suffix=".md", + prefix=f"dbt_audit_{model}_", + delete=False, ) ctx.write(full_prompt) ctx.close() diff --git a/dbt/dbt_batch_audit.py b/dbt/dbt_batch_audit.py index 6fca44a..36a8683 100644 --- a/dbt/dbt_batch_audit.py +++ b/dbt/dbt_batch_audit.py @@ -66,8 +66,18 @@ def get_sample_rows(model_name, root, limit): click.echo(f" Fetching sample rows for {model_name} (limit={limit})...") result = run( [ - "uv", "run", "dbt", "show", "-s", model_name, - "--limit", str(limit), "--output", "json", "--log-format", "json", + "uv", + "run", + "dbt", + "show", + "-s", + model_name, + "--limit", + str(limit), + "--output", + "json", + "--log-format", + "json", ], cwd=root, capture=True, @@ -82,13 +92,22 @@ def get_sample_rows(model_name, root, limit): def get_lineage(model_name, root): """Return a summary of immediate parents and children from dbt ls.""" lines = [] - for direction, selector in [("parents", f"+{model_name},1+{model_name}"), 
("children", f"{model_name}+,{model_name}1+")]: + for direction, selector in [ + ("parents", f"+{model_name},1+{model_name}"), + ("children", f"{model_name}+,{model_name}1+"), + ]: result = subprocess.run( ["uv", "run", "dbt", "ls", "-s", selector, "--output", "name", "--quiet"], - capture_output=True, text=True, cwd=root, + capture_output=True, + text=True, + cwd=root, ) if result.returncode == 0: - names = [n.strip() for n in result.stdout.strip().splitlines() if n.strip() and n.strip() != model_name] + names = [ + n.strip() + for n in result.stdout.strip().splitlines() + if n.strip() and n.strip() != model_name + ] if names: lines.append(f"**{direction.title()}:** {', '.join(names)}") return "\n".join(lines) if lines else "" @@ -151,18 +170,30 @@ def render_template(template, replacements): return template -def write_context_file(output_dir, model_name, template, compiled_sql, sample_rows, source_sql, lineage="", existing_tests=""): +def write_context_file( + output_dir, + model_name, + template, + compiled_sql, + sample_rows, + source_sql, + lineage="", + existing_tests="", +): """Write the full audit context to a file so cursor-agent can read it.""" ctx_dir = os.path.join(output_dir, ".context") os.makedirs(ctx_dir, exist_ok=True) - content = render_template(template, { - "compiled_sql": compiled_sql, - "sample_rows": sample_rows, - "existing_tests": existing_tests, - "lineage": lineage, - "data_profile": "", - }) + content = render_template( + template, + { + "compiled_sql": compiled_sql, + "sample_rows": sample_rows, + "existing_tests": existing_tests, + "lineage": lineage, + "data_profile": "", + }, + ) content += f"\n\nSource SQL:\n{source_sql}" ctx_path = os.path.join(ctx_dir, f"{model_name}__context.md") @@ -205,7 +236,8 @@ def validate_llms(llms): click.echo( f"ERROR: the following model(s) are not available in cursor-agent:\n" + "\n".join(f" - {m}" for m in invalid) - + f"\n\nAvailable models:\n " + "\n ".join(sorted(available)), + + f"\n\nAvailable 
models:\n " + + "\n ".join(sorted(available)), err=True, ) sys.exit(1) @@ -262,9 +294,7 @@ def synthesize_reports(reports, synthesis_model, output_dir, root): combined_parts = [] for model_name, llm, report in reports: - combined_parts.append( - f"---\n## Model: {model_name} | Reviewer: {llm}\n\n{report}\n" - ) + combined_parts.append(f"---\n## Model: {model_name} | Reviewer: {llm}\n\n{report}\n") combined_text = "\n".join(combined_parts) combined_path = os.path.join(output_dir, "all_individual_reports.md") @@ -274,9 +304,7 @@ def synthesize_reports(reports, synthesis_model, output_dir, root): # Write synthesis context to a file to avoid command-line length limits ctx_dir = os.path.join(output_dir, ".context") os.makedirs(ctx_dir, exist_ok=True) - synthesis_ctx_path = os.path.abspath( - os.path.join(ctx_dir, "synthesis_context.md") - ) + synthesis_ctx_path = os.path.abspath(os.path.join(ctx_dir, "synthesis_context.md")) synthesis_instructions = f"""\ You are a senior analytics engineer reviewing multiple dbt model audit reports. 
@@ -341,9 +369,7 @@ def synthesize_reports(reports, synthesis_model, output_dir, root): if result.returncode == 0: synthesis = result.stdout.strip() else: - synthesis = ( - f"(synthesis failed: exit {result.returncode})\n{result.stderr}" - ) + synthesis = f"(synthesis failed: exit {result.returncode})\n{result.stderr}" synthesis_path = os.path.join(output_dir, "final_synthesis.md") with open(synthesis_path, "w") as f: @@ -361,9 +387,7 @@ def resolve_sql_paths(paths): for p in paths: p = os.path.abspath(p) if os.path.isdir(p): - children = sorted( - f for f in glob.glob(os.path.join(p, "**", "*.sql"), recursive=True) - ) + children = sorted(f for f in glob.glob(os.path.join(p, "**", "*.sql"), recursive=True)) if not children: click.echo(f"WARNING: no .sql files found in {p}", err=True) resolved.extend(children) @@ -385,22 +409,31 @@ def model_name_from_path(filepath): @click.command() @click.argument("paths", nargs=-1, required=True) @click.option( - "--llm", "llms", required=True, multiple=True, + "--llm", + "llms", + required=True, + multiple=True, help="LLM model name for cursor-agent (repeatable)", ) @click.option("--root", required=True, help="Path to dbt project root") @click.option("--prompt", required=True, help="Path to the prompt template .md file") @click.option( - "--output-dir", default="./audit_reports", show_default=True, + "--output-dir", + default="./audit_reports", + show_default=True, help="Directory for output reports", ) @click.option("--limit", default=20, show_default=True, help="Row limit for dbt show") @click.option( - "--synthesis-model", default="sonnet-4.6-thinking", show_default=True, + "--synthesis-model", + default="sonnet-4.6-thinking", + show_default=True, help="LLM for the final synthesis step", ) @click.option( - "--concurrency", default=3, show_default=True, + "--concurrency", + default=3, + show_default=True, help="Max parallel cursor-agent invocations", ) def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, 
concurrency): @@ -422,7 +455,8 @@ def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurre name = model_name_from_path(filepath) if name in seen: click.echo( - f"ERROR: duplicate model name '{name}' from {filepath}", err=True, + f"ERROR: duplicate model name '{name}' from {filepath}", + err=True, ) sys.exit(1) seen.add(name) @@ -439,9 +473,7 @@ def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurre os.makedirs(output_dir, exist_ok=True) total = len(model_specs) * len(llms) - click.echo( - f"Auditing {len(model_specs)} model(s) × {len(llms)} LLM(s) = {total} audit(s)" - ) + click.echo(f"Auditing {len(model_specs)} model(s) × {len(llms)} LLM(s) = {total} audit(s)") click.echo(f"Concurrency: {concurrency} | Synthesis model: {synthesis_model}\n") context_paths = {} @@ -456,8 +488,14 @@ def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurre click.echo(f" Scanning tests for {name}...") existing_tests = get_existing_tests(name, root) context_paths[name] = write_context_file( - output_dir, name, template, compiled_sql, sample_rows, source_sql, - lineage=lineage, existing_tests=existing_tests, + output_dir, + name, + template, + compiled_sql, + sample_rows, + source_sql, + lineage=lineage, + existing_tests=existing_tests, ) click.echo(f"\nAll models compiled. 
Launching {total} audit(s)...\n") @@ -468,7 +506,12 @@ def main(paths, llms, root, prompt, output_dir, limit, synthesis_model, concurre for name, _ in model_specs: for llm in llms: fut = pool.submit( - run_audit, name, context_paths[name], llm, output_dir, root, + run_audit, + name, + context_paths[name], + llm, + output_dir, + root, ) futures[fut] = (name, llm) From e1269e96d283636eb84656945cca9f79d2d4054c Mon Sep 17 00:00:00 2001 From: Michael Barton <mail@michaelbarton.me.uk> Date: Mon, 9 Mar 2026 18:57:35 -0700 Subject: [PATCH 26/26] Delete dbt/SPEC.md --- dbt/SPEC.md | 549 ---------------------------------------------------- 1 file changed, 549 deletions(-) delete mode 100644 dbt/SPEC.md diff --git a/dbt/SPEC.md b/dbt/SPEC.md deleted file mode 100644 index cdd97d5..0000000 --- a/dbt/SPEC.md +++ /dev/null @@ -1,549 +0,0 @@ -# data-audit: specification - -A Python package for LLM-powered auditing of data artifacts — dbt models, SQL -scripts, Jupyter notebooks, Quarto documents, and flat data files. Designed to -be run by a single analyst or orchestrated by an autonomous "project manager" -agent that can expand the scope of an audit based on initial findings. - -## Problem - -Data teams accumulate a mix of dbt models, ad-hoc notebooks, CSV/Parquet -exports, and glue scripts. Quality issues hide at the seams: a notebook reads a -stale CSV export instead of the dbt model, a model silently fans out on a join, -a Quarto report hardcodes a date filter that drifts. Today's tooling audits -each artifact type in isolation — if at all. - -## Goals - -1. Audit heterogeneous data artifacts (dbt SQL, notebooks, flat files) through - a unified interface. -2. Support an autonomous orchestration loop where an LLM "project manager" can - review initial findings and spawn follow-up audits when warranted. -3. Produce structured, machine-readable findings alongside human-readable - reports so that both people and agents can act on results. -4. 
Be LLM-backend agnostic — work with direct API calls, local models, or CLI - wrappers. -5. Ship as an installable Python package (`pip install data-audit`) with a CLI - entry point. - -## Non-goals - -- Replacing dbt test or Great Expectations for deterministic, rule-based - testing. This tool is for exploratory, LLM-driven analysis. -- Real-time or CI-blocking checks. This is an offline review tool. -- Supporting non-Python notebook kernels (R, Julia) in the first version. - ---- - -## Artifact types - -### dbt models (.sql within a dbt project) - -This is the existing capability. Context gathering includes: - -- Compile the model (`dbt compile`) -- Fetch sample rows (`dbt show --output json`) -- Extract lineage — immediate parents and children (`dbt ls`) -- Discover existing tests from `schema.yml` files -- Read the source SQL - -Audit focus: join correctness, grain/uniqueness, filter logic, type mismatches, -test coverage gaps, upstream dependency risks. See the existing -`dbt_deep_analysis.md` prompt template for the full checklist. - -### dbt schema files (.yml) - -Context gathering: - -- Parse the YAML -- Cross-reference with the models it describes (do all models have schema - entries? do all columns exist?) -- Check test coverage across models - -Audit focus: missing descriptions, undocumented columns, missing or weak tests, -inconsistent naming conventions, stale entries for deleted models. - -### Jupyter notebooks (.ipynb) - -Notebooks vary widely. 
An analyst might be: - -- Running SQL queries against a warehouse -- Reading CSV/Parquet/Excel files from disk or S3 -- Pulling data from an API -- Doing pandas/polars transformations -- Producing charts or summary statistics - -Context gathering: - -- Extract all code cells and their outputs (where present) -- Identify data sources: SQL connection strings, file paths - (`pd.read_csv(...)`, `pd.read_parquet(...)`), API calls -- Extract imported libraries to understand the toolchain -- Note cell execution order and whether outputs are stale (execution count - gaps, missing outputs) -- If the notebook references dbt models or known tables, cross-reference with - the dbt project - -Audit focus: -- **Data source hygiene**: is the notebook reading raw files that should come - from a managed source? Are file paths hardcoded or parameterized? Are - connection strings embedded in code? -- **Reproducibility**: are cells ordered logically? Are there cells that depend - on execution side effects? Are random seeds set? Are outputs present and - consistent with the code? -- **Transformation logic**: are there pandas/polars transformations that - duplicate or contradict dbt model logic? Could this logic be pushed into the - transformation layer? -- **Data quality in-notebook**: are there silent drops (inner joins, dropna) - that could hide problems? Are filters reasonable? Are aggregations correct? -- **Staleness**: do outputs reference dates/values that suggest the notebook - hasn't been re-run recently? 
- -### Quarto documents (.qmd) - -Similar to notebooks but with additional structure: - -- YAML front matter (params, output format, execution options) -- Mixed prose and code chunks -- Cross-references and callouts - -Context gathering: - -- Parse YAML front matter for parameters and execution options -- Extract code chunks (```{python} blocks) -- Identify data sources same as notebooks -- Note `freeze: true` or other execution-control settings - -Audit focus: same as notebooks, plus parameter handling (are params used -consistently? are defaults reasonable?), and whether `freeze` settings mean -outputs could be stale. - -### Flat data files (.csv, .parquet, .json, .xlsx) - -Not code artifacts — these are data. An analyst might ask "audit this export" -or "check this CSV for issues." The PM agent might also flag a file referenced -by a notebook for closer inspection. - -Context gathering: - -- Read schema/column names and types -- Compute basic profiling: row count, null rates, cardinality, min/max/mean - for numerics, sample values for categoricals, date ranges for timestamps -- For Parquet: extract embedded schema metadata -- For CSV: detect delimiter, encoding, quoting issues -- If the file is referenced by a notebook or dbt model, note that relationship - -Audit focus: -- **Schema issues**: mixed types in columns, inconsistent date formats, - encoding problems -- **Completeness**: unexpected nulls, empty columns, truncated rows -- **Distribution anomalies**: outliers, impossible values (negative ages, - future dates), suspicious cardinality (a "country" column with 3 values) -- **Staleness**: if file metadata includes a timestamp, flag if it's old - relative to the project - ---- - -## Architecture - -### Core data model - -``` -Finding - id: str # unique within an audit run - artifact: str # path or identifier of the audited artifact - artifact_type: str # "dbt_model", "notebook", "data_file", etc. 
- category: str # "join_correctness", "data_quality", "reproducibility", etc. - severity: Severity # critical | warning | info - title: str # one-line summary - description: str # full explanation - evidence: str | None # query, code snippet, or data sample that demonstrates the issue - suggested_fix: str | None - downstream_impact: list[str] # artifact IDs affected if this isn't fixed - metadata: dict # arbitrary extra context (column names, line numbers, etc.) - -AuditResult - artifact: str - artifact_type: str - findings: list[Finding] - context_summary: str # what context was gathered (for traceability) - llm_model: str # which LLM produced this result - duration_seconds: float - raw_report: str # the full markdown report as returned by the LLM - -AuditPlan - tasks: list[AuditTask] # the full set of tasks (initial + follow-ups) - -AuditTask - artifact: str - artifact_type: str - prompt_template: str # which prompt to use - reason: str # why this task exists ("initial scan" or "follow-up: NULL propagation from stg_orders") - priority: int - parent_task_id: str | None # if this is a follow-up, which task spawned it - status: pending | running | completed | failed -``` - -### Auditor interface - -Each artifact type has an auditor that implements: - -```python -class Auditor(Protocol): - """Gathers context for an artifact and produces an LLM-ready prompt.""" - - artifact_type: str - - def can_handle(self, path: Path) -> bool: - """Return True if this auditor knows how to handle the given path.""" - ... - - def gather_context(self, path: Path, config: AuditConfig) -> AuditContext: - """Read the artifact and its surroundings, return structured context.""" - ... - - def render_prompt(self, context: AuditContext, template: str) -> str: - """Fill the prompt template with gathered context.""" - ... - - def parse_findings(self, raw_response: str, context: AuditContext) -> list[Finding]: - """Extract structured findings from the LLM's raw response.""" - ... 
-``` - -Concrete implementations: - -- `DbtModelAuditor` — wraps the existing compile/show/lineage/test-discovery logic -- `DbtSchemaAuditor` — parses YAML, cross-references models -- `NotebookAuditor` — extracts cells, identifies data sources, checks reproducibility -- `QuartoAuditor` — extends NotebookAuditor with front-matter parsing -- `DataFileAuditor` — profiles CSVs, Parquet, JSON, Excel files - -### LLM backend interface - -```python -class LLMBackend(Protocol): - """Send a prompt to an LLM and get a response.""" - - def complete(self, prompt: str, system: str | None = None) -> str: - ... - - def list_models(self) -> list[str]: - ... -``` - -Concrete implementations: - -- `CursorAgentBackend` — wraps the current `cursor-agent` CLI (preserves - existing workflow) -- `AnthropicBackend` — direct Claude API calls via the Anthropic SDK -- `LiteLLMBackend` — any model supported by litellm (OpenAI, local, etc.) - -### Orchestrator (the "project manager") - -The orchestrator is the key new capability. It operates in a loop: - -``` -1. Build initial AuditPlan from user-provided paths -2. While there are pending tasks and budget remains: - a. Pick next task(s) by priority - b. Run auditor(s) in parallel → collect AuditResults - c. Pass results to PM agent with the question: - "Given these findings, should we investigate anything further?" - d. PM agent returns zero or more follow-up AuditTasks - (e.g., "audit downstream model X", "profile data file Y referenced - in notebook Z", "do a deeper join analysis on model W") - e. Add follow-up tasks to the plan -3. 
Synthesize all AuditResults into a final report -``` - -**Budget controls** — the orchestrator enforces limits to prevent runaway -analysis: - -- `max_depth`: how many rounds of follow-ups (default: 2) -- `max_tasks`: total task cap across all rounds (default: 20) -- `max_wall_clock`: overall time limit (default: 30 minutes) -- `max_tokens`: approximate token budget across all LLM calls - -The PM agent prompt instructs it to be conservative — only spawn follow-up -tasks when there's concrete evidence that a finding has broader implications, -not speculatively. - -### Task graph - -Tasks form a tree (not a DAG — a follow-up task has exactly one parent). The -graph is used for: - -- Tracing why an audit was run ("this file was audited because the PM agent - identified it as a downstream consumer of a model with a join fan-out") -- Reporting: the final synthesis groups findings by their provenance -- Budget tracking: depth = longest path from root to leaf - -### Prompt templates - -Ship as package data under `data_audit/prompts/`: - -``` -dbt_model_deep.md # existing dbt_deep_analysis.md -dbt_model_quick.md # existing dbt_quick_analysis.md -dbt_schema.md # schema.yml audit -notebook.md # Jupyter/Quarto notebook audit -data_file.md # flat file profiling + audit -orchestrator.md # PM agent system prompt -synthesis.md # final report synthesis prompt -``` - -The existing Handlebars-style template engine (`{{var}}`, `{{#if var}}`, -`{{^if var}}`) is retained. It's simple and sufficient. - ---- - -## CLI - -``` -data-audit [OPTIONS] PATHS... -``` - -**Arguments:** - -- `PATHS` — files, directories, or globs. The tool auto-detects artifact type - by extension (`.sql` in a dbt project → dbt model, `.ipynb` → notebook, - `.csv`/`.parquet` → data file, etc.). Directories are walked recursively. 
- -**Options:** - -| Flag | Default | Description | -|------|---------|-------------| -| `--llm` | (required, repeatable) | LLM model to use for audits | -| `--backend` | `anthropic` | LLM backend: `anthropic`, `cursor-agent`, `litellm` | -| `--dbt-root` | auto-detect | Path to dbt project root (for dbt artifacts) | -| `--prompt` | built-in | Override the prompt template | -| `--output-dir` | `./audit_reports` | Where to write reports | -| `--output-format` | `markdown` | `markdown`, `json`, or `both` | -| `--limit` | `20` | Row limit for dbt show / data file sampling | -| `--synthesis-model` | same as `--llm` | LLM for synthesis step | -| `--orchestrate` | `false` | Enable the PM orchestration loop | -| `--max-depth` | `2` | Max follow-up rounds (requires `--orchestrate`) | -| `--max-tasks` | `20` | Max total audit tasks (requires `--orchestrate`) | -| `--concurrency` | `3` | Max parallel LLM calls | -| `--timeout` | `900` | Per-task timeout in seconds | -| `--verbose` | `false` | Show detailed progress | - -**Examples:** - -```bash -# Audit a few dbt models (current behavior, just packaged) -data-audit models/stg_orders.sql models/int_revenue.sql \ - --llm claude-sonnet-4-6 --dbt-root . 
- -# Audit a notebook -data-audit analysis/revenue_deep_dive.ipynb --llm claude-sonnet-4-6 - -# Audit an export file -data-audit exports/monthly_summary.csv --llm claude-sonnet-4-6 - -# Audit an entire directory with PM orchestration -data-audit models/ notebooks/ exports/ \ - --llm claude-sonnet-4-6 \ - --orchestrate --max-depth 2 --max-tasks 15 - -# JSON output for programmatic consumption -data-audit models/stg_orders.sql --llm claude-sonnet-4-6 --output-format json -``` - ---- - -## Programmatic API - -```python -from data_audit import audit, AuditConfig -from data_audit.backends import AnthropicBackend - -config = AuditConfig( - llm="claude-sonnet-4-6", - backend=AnthropicBackend(), - concurrency=3, - orchestrate=True, - max_depth=2, -) - -# Audit specific files -results = audit(["models/stg_orders.sql", "analysis/deep_dive.ipynb"], config) - -for finding in results.all_findings(): - print(f"[{finding.severity}] {finding.artifact}: {finding.title}") - -# Access the structured data -results.to_json("audit_output.json") -results.to_markdown("audit_report.md") -``` - ---- - -## Package structure - -``` -data-audit/ - pyproject.toml - src/data_audit/ - __init__.py # public API: audit(), AuditConfig - cli.py # click CLI entry point - - # Core data model - models.py # Finding, AuditResult, AuditTask, AuditPlan - - # Auditors — one per artifact type - auditors/ - __init__.py - base.py # Auditor protocol - dbt_model.py # compile, show, lineage, test discovery - dbt_schema.py # schema.yml cross-referencing - notebook.py # .ipynb extraction and analysis - quarto.py # .qmd front-matter + code chunks - data_file.py # CSV, Parquet, JSON, Excel profiling - registry.py # maps file extensions/patterns → auditors - - # Context gathering utilities - context/ - __init__.py - dbt.py # dbt compile/show/ls wrappers - notebook_parser.py # cell extraction, data source detection - file_profiler.py # column stats, null rates, distributions - template.py # Handlebars-style template 
rendering - - # LLM backends - backends/ - __init__.py - base.py # LLMBackend protocol - anthropic.py # direct Anthropic API - cursor_agent.py # subprocess wrapper for cursor-agent CLI - litellm.py # litellm pass-through - - # Orchestration - orchestrator/ - __init__.py - pm_agent.py # project manager LLM logic - task_graph.py # task tree, budget tracking - executor.py # parallel task execution - synthesizer.py # final report generation - - # Shipped prompt templates - prompts/ - dbt_model_deep.md - dbt_model_quick.md - dbt_schema.md - notebook.md - quarto.md - data_file.md - orchestrator.md - synthesis.md -``` - ---- - -## Migration path from current code - -The existing scripts (`dbt_batch_audit.py`, `dbt_analyse.py`) become thin -wrappers or are replaced entirely by the package CLI. Mapping: - -| Current | Package equivalent | -|---------|--------------------| -| `dbt_batch_audit.py` | `data-audit models/*.sql --llm ... --dbt-root .` | -| `dbt_analyse.py` (interactive tmux) | `data-audit --interactive models/stg_orders.sql` (or keep as a separate nvim-specific script that calls the package API) | -| `dbt_deep_analysis.md` | `src/data_audit/prompts/dbt_model_deep.md` | -| `dbt_quick_analysis.md` | `src/data_audit/prompts/dbt_model_quick.md` | -| `render_template()` | `src/data_audit/context/template.py` | -| `compile_model()`, `get_sample_rows()`, etc. | `src/data_audit/context/dbt.py` | -| `run_audit()` | `src/data_audit/orchestrator/executor.py` | -| `synthesize_reports()` | `src/data_audit/orchestrator/synthesizer.py` | -| cursor-agent hardcoding | `src/data_audit/backends/cursor_agent.py` (one of N backends) | - -The nvim integration (`nvim/lua/config/dbt.lua`) continues to work — it just -calls `data-audit` instead of the raw Python scripts. 
- ---- - -## Dependencies - -**Required:** - -- `click` — CLI framework -- `pyyaml` — YAML parsing (dbt schema files, Quarto front matter) - -**Optional (extras):** - -- `anthropic` — for the Anthropic backend (`pip install data-audit[anthropic]`) -- `litellm` — for the litellm backend (`pip install data-audit[litellm]`) -- `mdformat` — for markdown formatting of output reports -- `nbformat` — for robust Jupyter notebook parsing (fallback: raw JSON parsing) -- `pyarrow` or `polars` — for Parquet file profiling -- `openpyxl` — for Excel file profiling - ---- - -## Implementation phases - -### Phase 1: Extract and package - -Move the existing dbt model audit logic into the package structure. Get -`data-audit models/*.sql --llm ... --dbt-root .` working identically to the -current `dbt_batch_audit.py`. Ship the Auditor protocol, the dbt model auditor, -the cursor-agent backend, and the existing prompt templates. No new -capabilities — just packaging. - -### Phase 2: LLM backends + structured output - -Add the Anthropic and litellm backends. Implement the `Finding` data model and -`parse_findings()` so that results are available as structured data (JSON -output). This unblocks the orchestrator since the PM agent needs machine-readable -findings to reason over. - -### Phase 3: Notebook and data file auditors - -Implement `NotebookAuditor`, `QuartoAuditor`, and `DataFileAuditor` with their -context gatherers and prompt templates. At this point `data-audit` can audit -any artifact type, but only in a flat batch (no orchestration loop). - -### Phase 4: Orchestrator - -Implement the PM agent loop: initial plan → audit → review findings → spawn -follow-ups → synthesize. This is the most architecturally complex phase and -benefits from phases 2-3 being stable first. 
- -### Phase 5: Cross-artifact analysis - -Teach the orchestrator to reason across artifact types: "this notebook reads -the same table that dbt model X writes to — flag if findings in X affect the -notebook's conclusions." This requires a lightweight project graph (dbt lineage -+ notebook data source detection + file references). - ---- - -## Open questions - -1. **Package name**: `data-audit` is clean but generic. Alternatives: - `llm-data-audit`, `data-artifact-audit`, `dbt-audit-llm`. The scope beyond - dbt suggests avoiding `dbt-` as a prefix. - -2. **Structured finding extraction**: should the LLM be asked to return JSON - directly (fragile), or should we post-process the markdown response with a - second LLM call or regex-based parser? A hybrid approach (ask for markdown - with a machine-readable YAML block at the end) may be most robust. - -3. **Database access for data file profiling**: profiling flat files requires - reading them. For large files, should the tool spin up a temporary DuckDB - instance to run profiling queries, or rely on pandas/polars sampling? DuckDB - is likely the right answer since it handles CSV/Parquet/JSON natively and - the audit prompts already assume SQL-based evidence queries. - -4. **Interactive mode**: the current `dbt_analyse.py` launches an interactive - agent session in tmux. Should the package support this, or should it remain - a thin wrapper in the dotfiles that calls the package API? Leaning toward - the latter — interactive UX is editor-specific and doesn't belong in the - package. - -5. **Notebook output handling**: should the auditor include cell outputs - (which can be large — images, dataframes, tracebacks) in the LLM context? - Probably: include text outputs and truncated dataframe representations, - skip images unless the LLM supports vision, always note when outputs are - missing.