From 0b301090d8a4e08584cd0b2393d7bcb76f21becc Mon Sep 17 00:00:00 2001 From: Louis Choquel <8851983+lchoquel@users.noreply.github.com> Date: Sun, 14 Dec 2025 22:16:53 +0100 Subject: [PATCH 1/8] Use Pipelex Gateway (local dep) --- .env.example | 6 +- .pipelex/inference/backends.toml | 38 ++- .pipelex/inference/backends/anthropic.toml | 9 + .pipelex/inference/backends/azure_openai.toml | 39 ++- .pipelex/inference/backends/bedrock.toml | 9 + .pipelex/inference/backends/blackboxai.toml | 27 +- .pipelex/inference/backends/fal.toml | 40 +++ .pipelex/inference/backends/google.toml | 27 ++ .pipelex/inference/backends/groq.toml | 22 +- .pipelex/inference/backends/internal.toml | 8 + .pipelex/inference/backends/mistral.toml | 4 +- .pipelex/inference/backends/ollama.toml | 2 +- .pipelex/inference/backends/openai.toml | 35 +-- .../inference/backends/pipelex_gateway.toml | 41 +++ .../inference/backends/pipelex_inference.toml | 56 +++- .pipelex/inference/backends/portkey.toml | 272 ++++++++++++++++++ .pipelex/inference/backends/scaleway.toml | 67 +++++ .pipelex/inference/backends/vertexai.toml | 3 +- .pipelex/inference/backends/xai.toml | 2 +- .pipelex/inference/deck/base_deck.toml | 73 +++-- .pipelex/inference/routing_profiles.toml | 46 ++- .pipelex/pipelex_service.toml | 19 ++ .pipelex/telemetry.toml | 87 +++++- cocode/exceptions.py | 11 +- cocode/repox/repox_processor.py | 4 +- cocode/validation_cli.py | 5 +- pyproject.toml | 3 + tests/conftest.py | 13 +- uv.lock | 251 +++++++++++++++- 29 files changed, 1081 insertions(+), 138 deletions(-) create mode 100644 .pipelex/inference/backends/pipelex_gateway.toml create mode 100644 .pipelex/inference/backends/portkey.toml create mode 100644 .pipelex/inference/backends/scaleway.toml create mode 100644 .pipelex/pipelex_service.toml diff --git a/.env.example b/.env.example index 65770ca..d4d4999 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,6 @@ # [OPTIONAL] Free Pipelex Inference API key - Get yours on Discord: https://go.pipelex.com/discord # No credit card required, limited time offer -PIPELEX_INFERENCE_API_KEY= +PIPELEX_GATEWAY_API_KEY= # OpenAI: to use models like GPT-4o and GPT-5 OPENAI_API_KEY= @@ -21,10 +21,6 @@ ANTHROPIC_API_KEY= # To use Mistral models MISTRAL_API_KEY= -# To use perplexity, including results from web search -PERPLEXITY_API_KEY= -PERPLEXITY_API_ENDPOINT=https://api.perplexity.ai - # To generate images from fal.ai, the service of Forest Labs FAL_API_KEY= diff --git a/.pipelex/inference/backends.toml b/.pipelex/inference/backends.toml index 536f367..a243cb1 100644 --- a/.pipelex/inference/backends.toml +++ b/.pipelex/inference/backends.toml @@ -10,16 +10,15 @@ # #################################################################################################### -[pipelex_inference] -display_name = "⭐ Pipelex Inference" -enabled = true -endpoint = "https://inference.pipelex.com/v1" -api_key = "${PIPELEX_INFERENCE_API_KEY}" +[pipelex_gateway] +display_name = "⭐ Pipelex Gateway" +enabled = true # Enable after accepting terms via `pipelex init config` +api_key = "${PIPELEX_GATEWAY_API_KEY}" [anthropic] enabled = false api_key = "${ANTHROPIC_API_KEY}" -claude_4_tokens_limit = 8192 +valued_constraints = { max_output_tokens_limit = 8192 } [azure_openai] display_name = "Azure OpenAI" @@ -49,6 +48,12 @@ display_name = "Google AI" enabled = false api_key = "${GOOGLE_API_KEY}" +[groq] +display_name = "Groq" +enabled = false +endpoint = "https://api.groq.com/openai/v1" +api_key = "${GROQ_API_KEY}" + [mistral] display_name = "Mistral AI" enabled = false @@ -63,9 +68,21 @@ display_name = "OpenAI" enabled = false api_key = "${OPENAI_API_KEY}" +[portkey] +display_name = "Portkey" +enabled = false +endpoint = "https://api.portkey.ai/v1" +api_key = "${PORTKEY_API_KEY}" + +[scaleway] +display_name = "Scaleway" +enabled = false +endpoint = "${SCALEWAY_ENDPOINT}" +api_key = "${SCALEWAY_API_KEY}" + [vertexai] display_name = "Google Vertex AI" -enabled = false +enabled = false # This is the only one we disable beacuse setting it up requires internet access just to get credentials so it fails in CI sandboxes gcp_project_id = "${GCP_PROJECT_ID}" gcp_location = "${GCP_LOCATION}" gcp_credentials_file_path = "${GCP_CREDENTIALS_FILE_PATH}" @@ -78,3 +95,10 @@ api_key = "${XAI_API_KEY}" [internal] # software-only backend, runs internally, without AI enabled = true + +# Deprecated +[pipelex_inference] +display_name = "🛑 Legacy Pipelex Inference" +enabled = false +endpoint = "https://inference.pipelex.com/v1" +api_key = "${PIPELEX_INFERENCE_API_KEY}" diff --git a/.pipelex/inference/backends/anthropic.toml b/.pipelex/inference/backends/anthropic.toml index 66721e2..729e2d1 100644 --- a/.pipelex/inference/backends/anthropic.toml +++ b/.pipelex/inference/backends/anthropic.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "anthropic" prompting_target = "anthropic" +structure_method = "instructor/anthropic_tools" ################################################################################ # LANGUAGE MODELS @@ -97,3 +98,11 @@ inputs = ["text", "images"] outputs = ["text", "structured"] max_prompt_images = 100 costs = { input = 1.0, output = 5.0 } + +["claude-4.5-opus"] +model_id = "claude-opus-4-5-20251101" +max_tokens = 64000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 5.0, output = 25.0 } diff --git a/.pipelex/inference/backends/azure_openai.toml b/.pipelex/inference/backends/azure_openai.toml index 930f161..35ccb48 100644 --- a/.pipelex/inference/backends/azure_openai.toml +++ b/.pipelex/inference/backends/azure_openai.toml @@ -22,8 +22,9 @@ [defaults] model_type = "llm" -sdk = "azure_openai" +sdk = "azure_openai_responses" prompting_target = "openai" +structure_method = "instructor/openai_responses_tools" ################################################################################ # LANGUAGE MODELS @@ -67,24 +68,28 @@ model_id = "o1-mini-2024-09-12" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 3.0, output = 12.0 } +valued_constraints = { fixed_temperature = 1 } [o1] model_id = "o1-2024-12-17" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 15.0, output = 60.0 } +valued_constraints = { fixed_temperature = 1 } [o3-mini] model_id = "o3-mini-2025-01-31" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 1.1, output = 4.4 } +valued_constraints = { fixed_temperature = 1 } [o3] model_id = "o3-2025-04-16" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 2, output = 8 } +valued_constraints = { fixed_temperature = 1 } # --- GPT-5 Series ------------------------------------------------------------- [gpt-5-mini] @@ -92,21 +97,53 @@ model_id = "gpt-5-mini-2025-08-07" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.25, output = 2.0 } +valued_constraints = { fixed_temperature = 1 } [gpt-5-nano] model_id = "gpt-5-nano-2025-08-07" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.05, output = 0.4 } +valued_constraints = { fixed_temperature = 1 } [gpt-5-chat] model_id = "gpt-5-chat-2025-08-07" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } [gpt-5] model_id = "gpt-5-2025-08-07" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +# --- GPT-5.1 Series ------------------------------------------------------------- +["gpt-5.1"] +model_id = "gpt-5.1-2025-11-13" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +["gpt-5.1-chat"] +model_id = "gpt-5.1-chat-2025-11-13" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +################################################################################ +# IMAGE GENERATION MODELS +################################################################################ + +# --- OpenAI Image Generation -------------------------------------------------- +[gpt-image-1] +sdk = "azure_rest_img_gen" +model_type = "img_gen" +model_id = "gpt-image-1-2025-04-15" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.04, output = 0.0 } diff --git a/.pipelex/inference/backends/bedrock.toml b/.pipelex/inference/backends/bedrock.toml index cea9134..c4ab176 100644 --- a/.pipelex/inference/backends/bedrock.toml +++ b/.pipelex/inference/backends/bedrock.toml @@ -109,3 +109,12 @@ inputs = ["text", "images"] outputs = ["text", "structured"] max_prompt_images = 100 costs = { input = 1.0, output = 5.0 } + +["claude-4.5-opus"] +sdk = "bedrock_anthropic" +model_id = "global.anthropic.claude-opus-4-5-20251101-v1:0" +max_tokens = 8192 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 5.0, output = 25.0 } diff --git a/.pipelex/inference/backends/blackboxai.toml b/.pipelex/inference/backends/blackboxai.toml index 4c5909a..9ee0433 100644 --- a/.pipelex/inference/backends/blackboxai.toml +++ b/.pipelex/inference/backends/blackboxai.toml @@ -23,6 +23,7 @@ [defaults] model_type = "llm" sdk = "openai" +structure_method = "instructor/openai_tools" ################################################################################ # LANGUAGE MODELS @@ -145,7 +146,7 @@ costs = { input = 0.12, output = 0.39 } ["qwen2.5-vl-72b-instruct"] model_id = "blackboxai/qwen/qwen2.5-vl-72b-instruct" inputs = ["text", "images"] -outputs = ["text", "structured"] +outputs = ["text"] costs = { input = 0.25, output = 0.75 } # --- Amazon Nova Models ------------------------------------------------------- @@ -166,13 +167,13 @@ costs = { input = 0.06, output = 0.24 } ################################################################################ # --- DeepSeek Free Models ----------------------------------------------------- -[deepseek-chat-free] +[deepseek-chat] model_id = "blackboxai/deepseek/deepseek-chat:free" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 0.00, output = 0.00 } -[deepseek-r1-free] +[deepseek-r1] model_id = "blackboxai/deepseek/deepseek-r1:free" inputs = ["text"] outputs = ["text", "structured"] @@ -192,7 +193,7 @@ costs = { input = 0.00, output = 0.00 } [flux-pro] model_type = "img_gen" -sdk = "openai_alt_img_gen" +sdk = "blackboxai_img_gen" model_id = "blackboxai/black-forest-labs/flux-pro" inputs = ["text"] outputs = ["image"] @@ -200,7 +201,7 @@ costs = { input = 0.0, output = 0.04 } ["flux-pro/v1.1"] model_type = "img_gen" -sdk = "openai_alt_img_gen" +sdk = "blackboxai_img_gen" model_id = "blackboxai/black-forest-labs/flux-1.1-pro" inputs = ["text"] outputs = ["image"] @@ -208,15 +209,15 @@ costs = { input = 0.0, output = 0.04 } ["flux-pro/v1.1-ultra"] model_type = "img_gen" -sdk = "openai_alt_img_gen" +sdk = "blackboxai_img_gen" model_id = "blackboxai/black-forest-labs/flux-1.1-pro-ultra" inputs = ["text"] outputs = ["image"] costs = { input = 0.0, output = 0.06 } -["fast-lightning-sdxl"] +[fast-lightning-sdxl] model_type = "img_gen" -sdk = "openai_alt_img_gen" +sdk = "blackboxai_img_gen" model_id = "blackboxai/bytedance/sdxl-lightning-4step" inputs = ["text"] outputs = ["image"] @@ -224,8 +225,16 @@ costs = { input = 0.0, output = 0.0014 } [nano-banana] model_type = "img_gen" -sdk = "openai_alt_img_gen" +sdk = "blackboxai_img_gen" model_id = "blackboxai/google/nano-banana" inputs = ["text"] outputs = ["image"] costs = { input = 0.0, output = 0.039 } + +[nano-banana-pro] +model_type = "img_gen" +sdk = "blackboxai_img_gen" +model_id = "blackboxai/google/nano-banana-pro" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.0, output = 0.039 } diff --git a/.pipelex/inference/backends/fal.toml b/.pipelex/inference/backends/fal.toml index d54a25f..608bfed 100644 --- a/.pipelex/inference/backends/fal.toml +++ b/.pipelex/inference/backends/fal.toml @@ -36,18 +36,52 @@ inputs = ["text"] outputs = ["image"] costs = { input = 0.05, output = 0.0 } +[flux-pro.rules] +num_images = "fal" +aspect_ratio = "flux" +inference = "flux" +safety_checker = "available" +specific = "fal" + ["flux-pro/v1.1"] model_id = "fal-ai/flux-pro/v1.1" inputs = ["text"] outputs = ["image"] costs = { input = 0.05, output = 0.0 } +["flux-pro/v1.1".rules] +num_images = "fal" +aspect_ratio = "flux" +inference = "flux" +safety_checker = "available" +specific = "fal" + ["flux-pro/v1.1-ultra"] model_id = "fal-ai/flux-pro/v1.1-ultra" inputs = ["text"] outputs = ["image"] costs = { input = 0.06, output = 0.0 } +["flux-pro/v1.1-ultra".rules] +num_images = "fal" +aspect_ratio = "flux_11_ultra" +inference = "flux_11_ultra" +safety_checker = "available" +specific = "fal" + +[flux-2] +model_id = "fal-ai/flux-2" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.05, output = 0.0 } + +[flux-2.rules] +num_images = "fal" +aspect_ratio = "flux" +inference = "flux" +safety_checker = "available" +specific = "fal" + # --- SDXL models -------------------------------------------------------------- [fast-lightning-sdxl] model_id = "fal-ai/fast-lightning-sdxl" @@ -55,3 +89,9 @@ inputs = ["text"] outputs = ["image"] costs = { input = 0.0003, output = 0.0 } +[fast-lightning-sdxl.rules] +num_images = "fal" +specific = "fal" +aspect_ratio = "flux" +inference = "sdxl_lightning" +safety_checker = "unavailable" diff --git a/.pipelex/inference/backends/google.toml b/.pipelex/inference/backends/google.toml index 68c39d9..e0d3f0a 100644 --- a/.pipelex/inference/backends/google.toml +++ b/.pipelex/inference/backends/google.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "google" prompting_target = "gemini" +structure_method = "instructor/genai_tools" ################################################################################ # LANGUAGE MODELS @@ -58,3 +59,29 @@ inputs = ["text", "images"] outputs = ["text", "structured"] max_prompt_images = 3000 costs = { input = 0.10, output = 0.40 } + +# --- Gemini 3.0 Series ---------------------------------------- +["gemini-3.0-pro"] +model_id = "gemini-3-pro-preview" +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 3000 +costs = { input = 2, output = 12.0 } + +################################################################################ +# IMAGE GENERATION MODELS (Nano Banana) +################################################################################ + +[nano-banana] +model_type = "img_gen" +model_id = "gemini-2.5-flash-image" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.0, output = 0.039 } + +[nano-banana-pro] +model_type = "img_gen" +model_id = "gemini-3-pro-image-preview" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.0, output = 0.039 } diff --git a/.pipelex/inference/backends/groq.toml b/.pipelex/inference/backends/groq.toml index f115546..72bdae3 100644 --- a/.pipelex/inference/backends/groq.toml +++ b/.pipelex/inference/backends/groq.toml @@ -23,8 +23,8 @@ [defaults] model_type = "llm" -sdk = "groq" -prompting_target = "groq" +sdk = "openai" +structure_method = "instructor/json" ################################################################################ # PRODUCTION TEXT MODELS @@ -46,7 +46,7 @@ outputs = ["text", "structured"] costs = { input = 0.59, output = 0.79 } # --- Meta Llama Guard --------------------------------------------------------- -["meta-llama/llama-guard-4-12b"] +[llama-guard-4-12b] model_id = "meta-llama/llama-guard-4-12b" max_tokens = 1024 inputs = ["text"] @@ -54,14 +54,14 @@ outputs = ["text", "structured"] costs = { input = 0.20, output = 0.20 } # --- OpenAI GPT-OSS Models ---------------------------------------------------- -["openai/gpt-oss-20b"] +[gpt-oss-20b] model_id = "openai/gpt-oss-20b" max_tokens = 65536 inputs = ["text"] outputs = ["text", "structured"] costs = { input = 0.075, output = 0.30 } -["openai/gpt-oss-120b"] +[gpt-oss-120b] model_id = "openai/gpt-oss-120b" max_tokens = 65536 inputs = ["text"] @@ -88,7 +88,7 @@ costs = { input = 0.10, output = 0.30 } ################################################################################ # --- Meta Llama 4 Vision Models (Preview) ------------------------------------- -["meta-llama/llama-4-scout-17b-16e-instruct"] +[llama-4-scout-17b-16e-instruct] model_id = "meta-llama/llama-4-scout-17b-16e-instruct" max_tokens = 8192 inputs = ["text", "images"] @@ -96,7 +96,7 @@ outputs = ["text", "structured"] max_prompt_images = 5 costs = { input = 0.11, output = 0.34 } -["meta-llama/llama-4-maverick-17b-128e-instruct"] +[llama-4-maverick-17b-128e-instruct] model_id = "meta-llama/llama-4-maverick-17b-128e-instruct" max_tokens = 8192 inputs = ["text", "images"] @@ -105,7 +105,7 @@ max_prompt_images = 5 costs = { input = 0.20, output = 0.60 } # --- Moonshot Kimi K2 --------------------------------------------------------- -["moonshotai/kimi-k2-instruct-0905"] +[kimi-k2-instruct-0905] model_id = "moonshotai/kimi-k2-instruct-0905" max_tokens = 16384 inputs = ["text"] @@ -113,7 +113,7 @@ outputs = ["text", "structured"] costs = { input = 1.00, output = 3.00 } # --- OpenAI Safety Model ------------------------------------------------------ -["openai/gpt-oss-safeguard-20b"] +[gpt-oss-safeguard-20b] model_id = "openai/gpt-oss-safeguard-20b" max_tokens = 65536 inputs = ["text"] @@ -121,9 +121,9 @@ outputs = ["text", "structured"] costs = { input = 0.075, output = 0.30 } # --- Qwen 3 ------------------------------------------------------------------- -["qwen/qwen3-32b"] +[qwen3-32b] model_id = "qwen/qwen3-32b" max_tokens = 40960 inputs = ["text"] outputs = ["text", "structured"] -costs = { input = 0.29, output = 0.59 } \ No newline at end of file +costs = { input = 0.29, output = 0.59 } diff --git a/.pipelex/inference/backends/internal.toml b/.pipelex/inference/backends/internal.toml index 5bb9683..2d30b7f 100644 --- a/.pipelex/inference/backends/internal.toml +++ b/.pipelex/inference/backends/internal.toml @@ -27,3 +27,11 @@ inputs = ["pdf"] outputs = ["pages"] costs = {} +# --- Docling Text Extractor --------------------------------------------------- +[docling-extract-text] +model_type = "text_extractor" +sdk = "docling_sdk" +model_id = "extract-text" +inputs = ["pdf", "image"] +outputs = ["pages"] +costs = {} diff --git a/.pipelex/inference/backends/mistral.toml b/.pipelex/inference/backends/mistral.toml index 97e40d0..c4b49e3 100644 --- a/.pipelex/inference/backends/mistral.toml +++ b/.pipelex/inference/backends/mistral.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "mistral" prompting_target = "mistral" +structure_method = "instructor/mistral_tools" ################################################################################ # LANGUAGE MODELS @@ -129,7 +130,7 @@ outputs = ["text", "structured"] costs = { input = 0.4, output = 2.0 } ################################################################################ -# OCR MODELS +# EXTRACTION MODELS ################################################################################ # --- OCR Models --------------------------------------------------------------- @@ -140,4 +141,3 @@ max_tokens = 131072 inputs = ["pdf", "image"] outputs = ["pages"] costs = { input = 0.4, output = 2.0 } - diff --git a/.pipelex/inference/backends/ollama.toml b/.pipelex/inference/backends/ollama.toml index ab0f516..397e9ac 100644 --- a/.pipelex/inference/backends/ollama.toml +++ b/.pipelex/inference/backends/ollama.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "openai" prompting_target = "anthropic" +structure_method = "instructor/openai_tools" ################################################################################ # LANGUAGE MODELS @@ -60,4 +61,3 @@ inputs = ["text"] outputs = ["text"] costs = { input = 0, output = 0 } # TODO: support tokens - diff --git a/.pipelex/inference/backends/openai.toml b/.pipelex/inference/backends/openai.toml index 4f11481..bfb3dd9 100644 --- a/.pipelex/inference/backends/openai.toml +++ b/.pipelex/inference/backends/openai.toml @@ -22,8 +22,9 @@ [defaults] model_type = "llm" -sdk = "openai" +sdk = "openai_responses" prompting_target = "openai" +structure_method = "instructor/openai_responses_tools" ################################################################################ # LANGUAGE MODELS @@ -92,40 +93,33 @@ outputs = ["text", "structured"] costs = { input = 0.1, output = 0.4 } # --- o Series ---------------------------------------------------------------- -[o1-mini] -model_id = "o1-mini" -inputs = ["text"] -outputs = ["text"] -costs = { input = 3.0, output = 12.0 } -constraints = ["temperature_must_be_1"] - [o1] model_id = "o1" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 15.0, output = 60.0 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [o3-mini] model_id = "o3-mini" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 1.1, output = 4.4 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [o3] model_id = "o3" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 10.0, output = 40.0 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [o4-mini] model_id = "o4-mini" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 1.1, output = 4.4 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } # --- GPT-5 Series ------------------------------------------------------------- [gpt-5] @@ -133,28 +127,35 @@ model_id = "gpt-5" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [gpt-5-mini] model_id = "gpt-5-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.25, output = 2.0 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [gpt-5-nano] model_id = "gpt-5-nano" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.05, output = 0.4 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } [gpt-5-chat] model_id = "gpt-5-chat-latest" inputs = ["text", "images"] outputs = ["text"] costs = { input = 1.25, output = 10.0 } -constraints = ["temperature_must_be_1"] +valued_constraints = { fixed_temperature = 1 } + +# --- GPT-5.1 Series ------------------------------------------------------------- +["gpt-5.1"] +model_id = "gpt-5.1" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } ################################################################################ # IMAGE GENERATION MODELS @@ -162,9 +163,9 @@ constraints = ["temperature_must_be_1"] # --- OpenAI Image Generation -------------------------------------------------- [gpt-image-1] +sdk = "openai_img_gen" model_type = "img_gen" model_id = "gpt-image-1" inputs = ["text"] outputs = ["image"] costs = { input = 0.04, output = 0.0 } - diff --git a/.pipelex/inference/backends/pipelex_gateway.toml b/.pipelex/inference/backends/pipelex_gateway.toml new file mode 100644 index 0000000..bca075b --- /dev/null +++ b/.pipelex/inference/backends/pipelex_gateway.toml @@ -0,0 +1,41 @@ +################################################################################ +# Pipelex Gateway Local Overrides +################################################################################ +# +# TELEMETRY NOTICE: +# +# Using Pipelex Gateway enables identified telemetry tied to your API key +# (hashed for security). This is independent from your telemetry.toml settings. +# +# We collect only technical data (model names, token counts, latency, error rates). +# We do NOT collect prompts, completions, pipe codes, or business data. +# +# This allows us to monitor service quality, enforce fair usage, and support you. +# +################################################################################ +# +# WARNING: USE AT YOUR OWN RISK! +# +# The actual model configuration is fetched remotely from Pipelex servers. +# Any override in this file may cause unexpected behavior or failures, +# as the remote configuration may change at any time. +# +# If you must override, you may ONLY use these keys per model: +# - sdk +# - structure_method +# +# All other keys will be ignored. +# +# If you need custom configurations, consider using your own API keys +# with direct provider backends (openai, anthropic, etc.) instead. +# +# Documentation: +# https://docs.pipelex.com/home/7-configuration/config-technical/inference-backend-config/ +# Support: https://go.pipelex.com/discord +# +################################################################################ + +# Per-model overrides example: +# [gpt-4o] +# sdk = "gateway_completions" +# structure_method = "instructor/openai_tools" diff --git a/.pipelex/inference/backends/pipelex_inference.toml b/.pipelex/inference/backends/pipelex_inference.toml index e516e27..751c570 100644 --- a/.pipelex/inference/backends/pipelex_inference.toml +++ b/.pipelex/inference/backends/pipelex_inference.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "openai" prompting_target = "anthropic" +structure_method = "instructor/openai_tools" ################################################################################ # LANGUAGE MODELS @@ -35,54 +36,88 @@ model_id = "pipelex/gpt-4o" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 2.75, output = 11.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" [gpt-4o-mini] model_id = "pipelex/gpt-4o-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.17, output = 0.66 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" ["gpt-4.1"] model_id = "pipelex/gpt-4.1" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 2, output = 8 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" ["gpt-4.1-mini"] model_id = "pipelex/gpt-4.1-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.4, output = 1.6 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" ["gpt-4.1-nano"] model_id = "pipelex/gpt-4.1-nano" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.1, output = 0.4 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" [gpt-5-nano] model_id = "pipelex/gpt-5-nano" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.05, output = 0.40 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" [gpt-5-mini] model_id = "pipelex/gpt-5-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.25, output = 2.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" [gpt-5-chat] model_id = "pipelex/gpt-5-chat" inputs = ["text", "images"] outputs = ["text"] costs = { input = 1.25, output = 10.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" [gpt-5] model_id = "pipelex/gpt-5" inputs = ["text", "images"] outputs = ["text"] costs = { input = 1.25, output = 10.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" + +["gpt-5.1"] +model_id = "pipelex/gpt-5.1" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" + +["gpt-5.1-chat"] +model_id = "pipelex/gpt-5.1-chat" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.00 } +sdk = "openai_responses" +structure_method = "instructor/openai_responses_tools" # --- Claude LLMs -------------------------------------------------------------- ["claude-4-sonnet"] @@ -109,32 +144,45 @@ inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1, output = 5 } +["claude-4.5-opus"] +model_id = "pipelex/claude-4.5-opus" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 5, output = 25 } + # --- Gemini LLMs -------------------------------------------------------------- ["gemini-2.0-flash"] -model_id = "gemini/gemini-2.0-flash" +model_id = "pipelex/gemini-2.0-flash" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.10, output = 0.40 } ["gemini-2.5-pro"] -model_id = "gemini/gemini-2.5-pro" +model_id = "pipelex/gemini-2.5-pro" inputs = ["text", "images"] outputs = ["text", "structured"] max_prompt_images = 3000 costs = { input = 1.25, output = 10.0 } ["gemini-2.5-flash"] -model_id = "gemini/gemini-2.5-flash" +model_id = "pipelex/gemini-2.5-flash" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.30, output = 2.50 } ["gemini-2.5-flash-lite"] -model_id = "gemini/gemini-2.5-flash-lite" +model_id = "pipelex/gemini-2.5-flash-lite" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.10, output = 0.40 } +["gemini-3.0-pro"] +model_id = "pipelex/gemini-3.0-pro" +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 3000 +costs = { input = 2, output = 12.0 } + # --- XAI LLMs -------------------------------------------------------------- [grok-3] diff --git a/.pipelex/inference/backends/portkey.toml b/.pipelex/inference/backends/portkey.toml new file mode 100644 index 0000000..2080d0e --- /dev/null +++ b/.pipelex/inference/backends/portkey.toml @@ -0,0 +1,272 @@ +################################################################################ +# Portkey Configuration +################################################################################ +# +# This file defines the model specifications for the Portkey backend. +# It contains model definitions for various AI models. +# +# Configuration structure: +# - Each model is defined in its own section with the model name as the header +# - Headers with dots must be quoted (e.g., ["gpt-4.1"]) +# - Model costs are in USD per million tokens (input/output) +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +################################################################################ + +################################################################################ +# MODEL DEFAULTS +################################################################################ + +[defaults] +model_type = "llm" +sdk = "portkey_completions" +structure_method = "instructor/openai_tools" +prompting_target = "anthropic" + +################################################################################ +# LANGUAGE MODELS +################################################################################ + +# --- OpenAI LLMs -------------------------------------------------------------- +[gpt-4o-mini] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.15, output = 0.6 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[gpt-4o] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 2.5, output = 10.0 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +["gpt-4.1-nano"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.1, output = 0.4 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +["gpt-4.1-mini"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.4, output = 1.6 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +["gpt-4.1"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 2, output = 8 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[o1] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 15.0, output = 60.0 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[o3-mini] +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 1.1, output = 4.4 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[o3] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 2, output = 8 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[o4-mini] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.1, output = 4.4 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[gpt-5-nano] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.05, output = 0.4 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[gpt-5-mini] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.25, output = 2.0 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +[gpt-5] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +["gpt-5.1"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +["gpt-5.1-codex"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } +sdk = "portkey_responses" +structure_method = "instructor/openai_responses_tools" +x-portkey-provider = "@openai" + +# --- Claude LLMs -------------------------------------------------------------- +[claude-3-haiku] +model_id = "claude-3-haiku-20240307" +max_tokens = 4096 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 0.25, output = 1.25 } +x-portkey-provider = "@anthropic" + +[claude-3-opus] +model_id = "claude-3-opus-20240229" +max_tokens = 4096 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 15.0, output = 75.0 } +x-portkey-provider = "@anthropic" + +["claude-3.7-sonnet"] +model_id = "claude-3-7-sonnet-20250219" +max_tokens = 8192 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 3.0, output = 15.0 } +x-portkey-provider = "@anthropic" + +[claude-4-sonnet] +model_id = "claude-sonnet-4-20250514" +max_tokens = 64000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 3.0, output = 15.0 } +x-portkey-provider = "@anthropic" + +[claude-4-opus] +model_id = "claude-opus-4-20250514" +max_tokens = 32000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 3.0, output = 15.0 } +x-portkey-provider = "@anthropic" + +["claude-4.1-opus"] +model_id = "claude-opus-4-1-20250805" +max_tokens = 32000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 3.0, output = 15.0 } +x-portkey-provider = "@anthropic" + +["claude-4.5-sonnet"] +model_id = "claude-sonnet-4-5-20250929" +max_tokens = 64000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 3.0, output = 15.0 } +x-portkey-provider = "@anthropic" + +["claude-4.5-haiku"] +model_id = "claude-haiku-4-5-20251001" +max_tokens = 64000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 1.0, output = 5.0 } +x-portkey-provider = "@anthropic" + +["claude-4.5-opus"] +model_id = "claude-opus-4-5-20251101" +max_tokens = 64000 +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 100 +costs = { input = 5.0, output = 25.0 } +x-portkey-provider = "@anthropic" + +# --- Gemini LLMs -------------------------------------------------------------- +["gemini-2.0-flash"] +model_id = "gemini-2.0-flash" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.10, output = 0.40 } +x-portkey-provider = "@google" + +["gemini-2.5-pro"] +model_id = "gemini-2.5-pro" +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 3000 +costs = { input = 1.25, output = 10.0 } +x-portkey-provider = "@google" + +["gemini-2.5-flash"] +model_id = "gemini-2.5-flash" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.30, output = 2.50 } +x-portkey-provider = "@google" + +["gemini-2.5-flash-lite"] +model_id = "gemini-2.5-flash-lite" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 0.10, output = 0.40 } +x-portkey-provider = "@google" + +["gemini-3.0-pro"] +model_id = "gemini-3-pro-preview" +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 3000 +costs = { input = 2, output = 12.0 } +x-portkey-provider = "@google" diff --git a/.pipelex/inference/backends/scaleway.toml b/.pipelex/inference/backends/scaleway.toml new file mode 100644 index 0000000..20fe792 --- /dev/null +++ b/.pipelex/inference/backends/scaleway.toml @@ -0,0 +1,67 @@ +################################################################################ +# Groq Backend Configuration +################################################################################ +# +# This file defines the model specifications for Scaleway models. +# It contains model definitions for various LLM models accessible through +# the Groq API, including text-only and vision-capable models. +# +# Configuration structure: +# - Each model is defined in its own section with the model name as the header +# - Headers with dots or slashes must be quoted (e.g., ["meta-llama/llama-4-scout"]) +# - Model costs are in USD per million tokens (input/output) +# - Vision models support max 5 images per request, 33MP max resolution +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +################################################################################ + +################################################################################ +# MODEL DEFAULTS +################################################################################ + +[defaults] +model_type = "llm" +sdk = "openai" +structure_method = "instructor/json" + +# --- DeepSeek Models ---------------------------------------------------------- +[deepseek-r1-distill-llama-70b] +max_tokens = 32768 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.90, output = 0.90 } + +# --- Meta Llama 3.x Series ---------------------------------------------------- +["llama-3.1-8b-instruct"] +max_tokens = 131072 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.2, output = 0.2 } + +["llama-3.3-70b-instruct"] +max_tokens = 32768 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.90, output = 0.90 } + +# --- OpenAI GPT-OSS Models ---------------------------------------------------- +[gpt-oss-120b] +max_tokens = 65536 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.15, output = 0.60 } + +# --- Qwen 3 ------------------------------------------------------------------- +[qwen3-235b-a22b-instruct-2507] +max_tokens = 40960 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.75, output = 2.25 } + +[qwen3-coder-30b-a3b-instruct] +max_tokens = 40960 +inputs = ["text"] +outputs = ["text", "structured"] +costs = { input = 0.20, output = 0.80 } diff --git a/.pipelex/inference/backends/vertexai.toml b/.pipelex/inference/backends/vertexai.toml index da01d4f..1ebab79 100644 --- a/.pipelex/inference/backends/vertexai.toml +++ b/.pipelex/inference/backends/vertexai.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "openai" prompting_target = "gemini" +structure_method = "instructor/vertexai_tools" ################################################################################ # LANGUAGE MODELS @@ -50,4 +51,4 @@ model_id = "google/gemini-2.5-flash" inputs = ["text", "images"] outputs = ["text", "structured"] max_prompt_images = 3000 -costs = { input = 0.30, output = 2.50 } \ No newline at end of file +costs = { input = 0.30, output = 2.50 } diff --git a/.pipelex/inference/backends/xai.toml b/.pipelex/inference/backends/xai.toml index a4bb433..3045344 100644 --- a/.pipelex/inference/backends/xai.toml +++ b/.pipelex/inference/backends/xai.toml @@ -24,6 +24,7 @@ model_type = "llm" sdk = "openai" prompting_target = "anthropic" +structure_method = "instructor/openai_tools" ################################################################################ # LANGUAGE MODELS @@ -53,4 +54,3 @@ model_id = "grok-3-mini-fast-latest" inputs = ["text"] outputs = ["text"] costs = { input = 0.15, output = 4 } - diff --git a/.pipelex/inference/deck/base_deck.toml b/.pipelex/inference/deck/base_deck.toml index 12f5167..d7dba4d 100644 --- a/.pipelex/inference/deck/base_deck.toml +++ b/.pipelex/inference/deck/base_deck.toml @@ -21,16 +21,21 @@ base-claude = "claude-4.5-sonnet" base-gpt = "gpt-4o" base-gemini = "gemini-2.5-flash" base-mistral = "mistral-medium" +base-groq = "llama-3.3-70b-versatile" +base-grok = "grok-4-fast-non-reasoning" -best-gpt = "gpt-4o" -best-claude = "claude-4.1-opus" -best-gemini = "gemini-2.5-pro" +best-gpt = "gpt-5.1" +best-claude = "claude-4.5-opus" +best-gemini = "gemini-3.0-pro" best-mistral = "mistral-medium" -best-grok = "grok-3" + +# Groq-specific aliases +fast-groq = "llama-3.1-8b-instant" +vision-groq = "llama-4-scout-17b-16e-instruct" # Image generation aliases base-img-gen = "flux-pro/v1.1" -best-img-gen = "flux-pro/v1.1-ultra" +best-img-gen = "flux-2" fast-img-gen = "fast-lightning-sdxl" #################################################################################################### @@ -38,35 +43,49 @@ fast-img-gen = "fast-lightning-sdxl" #################################################################################################### [waterfalls] + +# --- Waterfalls for LLMs --------------------------------------------------------------------- smart_llm = [ + "claude-4.5-opus", "claude-4.5-sonnet", + "gemini-3.0-pro", + "gpt-5.1", "claude-4.1-opus", - "claude-4-sonnet", "gemini-2.5-pro", - "gpt-4o", # we're using gpt-4o here rather than gpt-5 until we'll handle reasoning parameters - "grok-3", + "claude-4-sonnet", + "grok-4", ] smart_llm_with_vision = [ + "claude-4.5-opus", "claude-4.5-sonnet", + "gemini-3.0-pro", + "gpt-5.1", "claude-4.1-opus", - "claude-4-sonnet", "gemini-2.5-pro", - "gpt-4o", # we use gpt-4o here rather than gpt-5 until we handle reasoning parameters - "grok-3", + "claude-4-sonnet", + "grok-4", ] smart_llm_for_structured = [ + "claude-4.5-opus", "claude-4.5-sonnet", + "gemini-3.0-pro", + "gpt-5.1", "claude-4.1-opus", "claude-4-sonnet", - "gpt-4o", # we use gpt-4o here rather than gpt-5 until we handle reasoning parameters ] - +llm_for_creativity = [ + "claude-4.5-opus", + "claude-4.1-opus", + "gemini-2.5-pro", + "gpt-5.1", +] llm_for_large_codebase = [ "gemini-2.5-pro", "claude-4.5-sonnet", - "gpt-4o", # we use gpt-4o here rather than gpt-5 until we handle reasoning parameters + "gemini-3.0-pro", + "gpt-5.1", "gemini-2.5-flash", - "grok-3", + "grok-4", ] cheap_llm = [ "gpt-4o-mini", @@ -76,11 +95,21 @@ cheap_llm = [ "grok-3-mini", ] cheap_llm_for_vision = [ - "gpt-4o-mini", "gemini-2.5-flash-lite", + "gpt-4o-mini", "claude-3-haiku", ] cheap_llm_for_structured = ["gpt-4o-mini", "mistral-small", "claude-3-haiku"] +cheap_llm_for_creativity = [ + "gemini-2.5-flash", + "grok-3-mini", + "gpt-4o-mini", + "claude-4.5-haiku", +] + +# --- Waterfalls for Extracts --------------------------------------------------------------------- +pdf_text_extractor = ["mistral-ocr", "pypdfium2-extract-text"] +image_text_extractor = ["mistral-ocr"] #################################################################################################### # LLM Presets @@ -108,17 +137,22 @@ llm_to_analyze_large_codebase = { model = "base-claude", temperature = 0.1 } # Vision skills llm_for_img_to_text_cheap = { model = "gpt-4o-mini", temperature = 0.1 } llm_for_img_to_text = { model = "base-claude", temperature = 0.1 } -llm_for_diagram_to_text = { model = "base-claude", temperature = 0.3 } +llm_for_diagram_to_text = { model = "best-claude", temperature = 0.3 } llm_for_table_to_text = { model = "base-claude", temperature = 0.3 } # Image generation prompting skills llm_to_prompt_img_gen = { model = "base-claude", temperature = 0.2 } llm_to_prompt_img_gen_cheap = { model = "gpt-4o-mini", temperature = 0.5 } +# Groq-specific presets (fast inference, low cost) +llm_groq_fast_text = { model = "fast-groq", temperature = 0.7 } +llm_groq_balanced = { model = "base-groq", temperature = 0.5 } +llm_groq_vision = { model = "vision-groq", temperature = 0.3 } + # LLM Presets — For Testing --------------------------------------------------------------------- llm_for_testing_gen_text = { model = "cheap_llm", temperature = 0.5 } -llm_for_testing_gen_object = { model = "cheap_llm_for_structured", temperature = 0.5 } +llm_for_testing_gen_object = { model = "cheap_llm_for_structured", temperature = 0.1 } llm_for_testing_vision = { model = "cheap_llm_for_vision", temperature = 0.5 } llm_for_testing_vision_structured = { model = "cheap_llm_for_vision", temperature = 0.5 } @@ -154,6 +188,7 @@ choice_default = "gen_image_basic" gen_image_basic = { model = "base-img-gen", quality = "medium", guidance_scale = 7.5, is_moderated = true, safety_tolerance = 3 } gen_image_fast = { model = "fast-img-gen", nb_steps = 4, guidance_scale = 5.0, is_moderated = true, safety_tolerance = 3 } gen_image_high_quality = { model = "best-img-gen", quality = "high", guidance_scale = 8.0, is_moderated = true, safety_tolerance = 3 } +gen_image_openai_low_quality = { model = "gpt-image-1", quality = "low" } # Specific skills img_gen_for_art = { model = "best-img-gen", quality = "high", guidance_scale = 9.0, is_moderated = false, safety_tolerance = 5 } @@ -161,3 +196,5 @@ img_gen_for_diagram = { model = "base-img-gen", quality = "medium", guidance_sca img_gen_for_mockup = { model = "base-img-gen", quality = "medium", guidance_scale = 6.5, is_moderated = true, safety_tolerance = 3 } img_gen_for_product = { model = "best-img-gen", quality = "high", guidance_scale = 8.5, is_moderated = true, safety_tolerance = 2 } img_gen_for_testing = { model = "fast-img-gen", nb_steps = 4, guidance_scale = 4.0, is_moderated = true, safety_tolerance = 4 } + +# img_gen_for_testing = { model = "nano-banana" } diff --git a/.pipelex/inference/routing_profiles.toml b/.pipelex/inference/routing_profiles.toml index 47e87ec..76891b0 100644 --- a/.pipelex/inference/routing_profiles.toml +++ b/.pipelex/inference/routing_profiles.toml @@ -9,7 +9,7 @@ # ========================================================================================= # Which profile to use (change this to switch routing) -active = "pipelex_first" +active = "pipelex_gateway_first" # We recommend using the "pipelex_first" profile to get a head start with all models. # The Pipelex Inference backend is currently not recommended for production use, @@ -25,31 +25,6 @@ active = "pipelex_first" # Routing Profiles # ========================================================================================= -[profiles.pipelex_first] -description = "Use Pipelex Inference backend for all its supported models" -default = "pipelex_inference" -fallback_order = [ - "pipelex_inference", - "azure_openai", - "bedrock", - "blackboxai", - "mistral", - "fal", -] - -[profiles.pipelex_first.routes] -# Pattern matching: "model-pattern" = "backend-name" -"gpt-*" = "pipelex_inference" -"claude-*" = "pipelex_inference" -"grok-*" = "pipelex_inference" -"gemini-*" = "pipelex_inference" - -[profiles.pipelex_first.optional_routes] # Each optional route is considered only if its backend is available -"*-sdxl" = "fal" -"flux-*" = "fal" -"gpt-image-1" = "openai" -"mistral-ocr" = "mistral" - [profiles.all_anthropic] description = "Use Anthropic backend for all its supported models" default = "anthropic" @@ -149,3 +124,22 @@ description = "Example routing profile using specific models" "grok-3" = "pipelex_inference" "grok-3-mini" = "xai" + +[profiles.pipelex_gateway_first] +description = "Use Pipelex Inference backend for all its supported models" +default = "pipelex_gateway" +fallback_order = ["pipelex_gateway", "azure_openai", "bedrock", "blackboxai", "mistral", "fal"] + +[profiles.pipelex_gateway_first.routes] +# Pattern matching: "model-pattern" = "backend-name" +"gpt-*" = "pipelex_gateway" +"claude-*" = "pipelex_gateway" +"grok-*" = "pipelex_gateway" +"gemini-*" = "pipelex_gateway" + +[profiles.pipelex_gateway_first.optional_routes] # Each optional route is considered only if its backend is available +"*-sdxl" = "fal" +"flux-*" = "fal" +"gpt-image-1" = "openai" +"mistral-ocr" = "mistral" + diff --git a/.pipelex/pipelex_service.toml b/.pipelex/pipelex_service.toml new file mode 100644 index 0000000..afe39a2 --- /dev/null +++ b/.pipelex/pipelex_service.toml @@ -0,0 +1,19 @@ +#################################################################################################### +# Pipelex Service Configuration +#################################################################################################### +# +# This file stores settings related to Pipelex managed services. +# Currently used for Pipelex Gateway terms acceptance. +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + +[agreement] +# Set to true after accepting Pipelex terms of service. +terms_accepted = true + +# Note: when using pipelex_gateway, telemetry is enabled to monitor service usage. +# We collect technical data (model, pipe type...) and quantitative data (token counts...) +# but NOT your content, pipe codes, or output class names. diff --git a/.pipelex/telemetry.toml b/.pipelex/telemetry.toml index 9fd46cd..75207a7 100644 --- a/.pipelex/telemetry.toml +++ b/.pipelex/telemetry.toml @@ -1,21 +1,84 @@ #################################################################################################### -# Pipelex Telemetry Configuration +# Custom Telemetry Configuration #################################################################################################### # -# This file controls telemetry settings for Pipelex usage analytics. -# Telemetry helps us improve Pipelex by understanding how it's used. +# This file controls YOUR custom telemetry settings for observability and analytics. +# Configure your own PostHog, Langfuse, or OTLP-compatible backends here. +# +# NOTE: When using Pipelex Gateway, identified telemetry is automatically enabled +# (tied to your Gateway API key, hashed for security). This allows us to monitor +# service quality, enforce fair usage, and provide you with better support. +# Gateway telemetry operates independently from your settings below - you can have both! +# +# To disable all telemetry, set the DO_NOT_TRACK=1 environment variable. # # Documentation: https://docs.pipelex.com # Support: https://go.pipelex.com/discord # #################################################################################################### -telemetry_mode = "anonymous" # Values: "off" | "anonymous" | "identified" -host = "https://eu.i.posthog.com" -project_api_key = "phc_HPJnNKpIXh0SxNDYyTAyUtnq9KxNNZJWQszynsWVx4Y" -respect_dnt = true -redact = ["prompt", "system_prompt", "response", "file_path", "url"] -geoip_enabled = true -dry_mode_enabled = false -verbose_enabled = false -user_id = "" +# ────────────────────────────────────────────────────────────────────────────── +# PostHog Configuration (Event tracking + AI span tracing) +# ────────────────────────────────────────────────────────────────────────────── + +[posthog] +mode = "off" # Values: "off" | "anonymous" | "identified" +# user_id = "your_user_id" # Required when mode = "identified" +endpoint = "${POSTHOG_ENDPOINT}" # Default: https://us.i.posthog.com (or https://eu.i.posthog.com for EU) +api_key = "${POSTHOG_API_KEY}" # Get from PostHog Project Settings +geoip = true # Enable GeoIP lookup +debug = false # Enable PostHog debug mode +redact_properties = [ + "prompt", + "system_prompt", + "response", + "file_path", + "url", +] # Event properties to redact + +# AI span tracing to YOUR PostHog (does NOT affect Langfuse/OTLP - they receive full data) +[posthog.tracing] +enabled = false # Send AI spans to your PostHog + +# Privacy controls for data sent to YOUR PostHog only +[posthog.tracing.capture] +content = false # Capture prompt/completion content +# content_max_length = 1000 # Max length for captured content (omit for unlimited) +pipe_codes = false # Include pipe codes in span names/attributes +output_class_names = false # Include output class names in span names/attributes + +# ────────────────────────────────────────────────────────────────────────────── +# Langfuse Integration +# Note: Langfuse receives FULL span data (no redaction) +# ────────────────────────────────────────────────────────────────────────────── + +[langfuse] +enabled = false +# endpoint = "https://cloud.langfuse.com" # Override for self-hosted Langfuse +# public_key = "${LANGFUSE_PUBLIC_KEY}" # Langfuse public key +# secret_key = "${LANGFUSE_SECRET_KEY}" # Langfuse secret key + +# ────────────────────────────────────────────────────────────────────────────── +# Additional OTLP Exporters (array for multiple) +# Note: OTLP exporters receive FULL span data (no redaction) +# ────────────────────────────────────────────────────────────────────────────── + +# [[otlp]] +# name = "my-collector" # Identifier for logging +# endpoint = "https://..." # OTLP endpoint URL +# headers = { Authorization = "Bearer ${OTLP_AUTH_TOKEN}" } # Headers for OTLP export + +# ────────────────────────────────────────────────────────────────────────────── +# Custom Telemetry Allowed Modes +# Controls which integration modes can use custom telemetry settings above. +# ────────────────────────────────────────────────────────────────────────────── + +[custom_telemetry_allowed_modes] +ci = false # CI environments don't use custom telemetry +cli = true # CLI usage allows custom telemetry +docker = true # Docker deployments allow custom telemetry +fastapi = true # FastAPI integrations allow custom telemetry +mcp = true # MCP integrations allow custom telemetry +n8n = true # n8n integrations allow custom telemetry +pytest = false # Tests don't use custom telemetry +python = false # Direct Python SDK usage doesn't use custom telemetry by default diff --git a/cocode/exceptions.py b/cocode/exceptions.py index 06e4afd..f0a44d0 100644 --- a/cocode/exceptions.py +++ b/cocode/exceptions.py @@ -1,13 +1,16 @@ -from pipelex.base_exceptions import RootException +class CocodeError(Exception): + def __init__(self, message: str): + super().__init__(message) + self.message = message -class PythonProcessingError(RootException): +class PythonProcessingError(CocodeError): pass -class RepoxException(RootException): +class RepoxException(CocodeError): pass -class NoDifferencesFound(RootException): +class NoDifferencesFound(CocodeError): pass diff --git a/cocode/repox/repox_processor.py b/cocode/repox/repox_processor.py index c670a17..20adf97 100644 --- a/cocode/repox/repox_processor.py +++ b/cocode/repox/repox_processor.py @@ -16,7 +16,7 @@ import pathspec from pathspec import PathSpec from pipelex import log -from pipelex.tools.misc.filetype_utils import FileType, FileTypeException +from pipelex.tools.misc.filetype_utils import FileType, FileTypeError from cocode.exceptions import RepoxException from cocode.repox.models import OutputStyle @@ -257,7 +257,7 @@ def process_file_contents(self) -> Dict[str, str]: # text file file_type, text_if_applicable = file_check file_content = self._specific_text_file_processing(file_type=file_type, text=text_if_applicable) - except FileTypeException as exc: + except FileTypeError as exc: log.warning(f"Skipping '{file_path}' - could not determine file type: {exc}") continue file_contents[relative_path] = file_content diff --git a/cocode/validation_cli.py b/cocode/validation_cli.py index e633e30..988feb8 100644 --- a/cocode/validation_cli.py +++ b/cocode/validation_cli.py @@ -6,6 +6,7 @@ import typer from pipelex import log +from pipelex.cli.commands.validate_cmd import do_validate_all_libraries_and_dry_run from pipelex.hub import get_pipes from pipelex.pipe_run.dry_run import dry_run_pipes from pipelex.pipelex import Pipelex @@ -21,7 +22,7 @@ @validation_app.command("validate") def validate() -> None: """Run the setup sequence and validate all pipelines.""" - Pipelex.get_instance().validate_libraries() + do_validate_all_libraries_and_dry_run() asyncio.run(dry_run_pipes(get_pipes())) log.info("Setup sequence passed OK, config and pipelines are validated.") @@ -36,5 +37,5 @@ def dry_run() -> None: @validation_app.command("check-config") def check_config() -> None: """Validate Pipelex configuration and libraries.""" - Pipelex.get_instance().validate_libraries() + do_validate_all_libraries_and_dry_run() log.info("Configuration validation passed OK.") diff --git a/pyproject.toml b/pyproject.toml index 3322922..71be215 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ dependencies = [ "PyGithub==2.4.0", ] +[tool.uv.sources] +pipelex = { path = "../pipelex", editable = true } + [project.optional-dependencies] docs = [ "mkdocs==1.6.1", diff --git a/tests/conftest.py b/tests/conftest.py index c82cd91..a78f630 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,13 @@ import pipelex.config import pipelex.pipelex import pytest +from pipelex.cli.cli_factory import make_pipelex_for_cli +from pipelex.cli.commands.validate_cmd import do_validate_all_libraries_and_dry_run +from pipelex.cli.error_handlers import ErrorContext from pipelex.config import get_config +from pipelex.pipelex import Pipelex from pipelex.system.configuration.config_check import check_is_initialized +from pipelex.system.configuration.configs import PipelexConfig from rich import print from rich.console import Console from rich.traceback import Traceback @@ -26,17 +31,17 @@ def reset_pipelex_config_fixture(): # Code to run before each test print("\n[magenta]pipelex setup[/magenta]") try: - pipelex_instance = pipelex.pipelex.Pipelex.make() - pipelex_instance.validate_libraries() + make_pipelex_for_cli(context=ErrorContext.VALIDATION) + do_validate_all_libraries_and_dry_run() config = get_config() - assert isinstance(config, pipelex.config.PipelexConfig) + assert isinstance(config, PipelexConfig) except Exception as exc: Console().print(Traceback()) pytest.exit(f"Critical Pipelex setup error: {exc}") yield # Code to run after each test print("\n[magenta]pipelex teardown[/magenta]") - pipelex_instance.teardown() + Pipelex.teardown_if_needed() @pytest.fixture(scope="function", autouse=True) diff --git a/uv.lock b/uv.lock index 1ea32a1..d926a0b 100644 --- a/uv.lock +++ b/uv.lock @@ -336,6 +336,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/84/06490071e26bab22ac79a684e98445df118adcf80c58c33ba5af184030f2/botocore_stubs-1.38.46-py3-none-any.whl", hash = "sha256:cc21d9a7dd994bdd90872db4664d817c4719b51cda8004fd507a4bf65b085a75", size = 66083, upload-time = "2025-06-29T22:58:22.234Z" }, ] +[[package]] +name = "cached-property" +version = "2.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/4b/3d870836119dbe9a5e3c9a61af8cc1a8b69d75aea564572e385882d5aefb/cached_property-2.0.1.tar.gz", hash = "sha256:484d617105e3ee0e4f1f58725e72a8ef9e93deee462222dbd51cd91230897641", size = 10574, upload-time = "2024-10-25T15:43:55.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/0e/7d8225aab3bc1a0f5811f8e1b557aa034ac04bdf641925b30d3caf586b28/cached_property-2.0.1-py3-none-any.whl", hash = "sha256:f617d70ab1100b7bcf6e42228f9ddcb78c676ffa167278d9f730d1c2fba69ccb", size = 7428, upload-time = "2024-10-25T15:43:54.711Z" }, +] + [[package]] name = "cachetools" version = "5.5.2" @@ -551,7 +560,7 @@ requires-dist = [ { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, - { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], specifier = "==0.15.7" }, + { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], editable = "../pipelex" }, { name = "pygithub", specifier = "==2.4.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, @@ -927,6 +936,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/14/e5e8fbca8863fee718208566c4e927b8e9f45fd46ec5cf89e24759da545b/google_genai-1.41.0-py3-none-any.whl", hash = "sha256:111a3ee64c1a0927d3879faddb368234594432479a40c311e5fe4db338ca8778", size = 245931, upload-time = "2025-10-02T22:30:27.885Z" }, ] +[[package]] +name = "googleapis-common-protos" +version = "1.72.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e5/7b/adfd75544c415c487b33061fe7ae526165241c1ea133f9a9125a56b39fd8/googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5", size = 147433, upload-time = "2025-11-06T18:29:24.087Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -973,6 +994,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, +] + [[package]] name = "iniconfig" version = "2.1.0" @@ -1648,6 +1681,88 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/2a/7dd3d207ec669cacc1f186fd856a0f61dbc255d24f6fdc1a6715d6051b0f/openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315", size = 948627, upload-time = "2025-09-24T13:00:50.754Z" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/b9/3161be15bb8e3ad01be8be5a968a9237c3027c5be504362ff800fca3e442/opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c", size = 65767, upload-time = "2025-12-11T13:32:39.182Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/df/d3f1ddf4bb4cb50ed9b1139cc7b1c54c34a1e7ce8fd1b9a37c0d1551a6bd/opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950", size = 66356, upload-time = "2025-12-11T13:32:17.304Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/9d/22d241b66f7bbde88a3bfa6847a351d2c46b84de23e71222c6aae25c7050/opentelemetry_exporter_otlp_proto_common-1.39.1.tar.gz", hash = "sha256:763370d4737a59741c89a67b50f9e39271639ee4afc999dadfe768541c027464", size = 20409, upload-time = "2025-12-11T13:32:40.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/02/ffc3e143d89a27ac21fd557365b98bd0653b98de8a101151d5805b5d4c33/opentelemetry_exporter_otlp_proto_common-1.39.1-py3-none-any.whl", hash = "sha256:08f8a5862d64cc3435105686d0216c1365dc5701f86844a8cd56597d0c764fde", size = 18366, upload-time = "2025-12-11T13:32:20.2Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/04/2a08fa9c0214ae38880df01e8bfae12b067ec0793446578575e5080d6545/opentelemetry_exporter_otlp_proto_http-1.39.1.tar.gz", hash = "sha256:31bdab9745c709ce90a49a0624c2bd445d31a28ba34275951a6a362d16a0b9cb", size = 17288, upload-time = "2025-12-11T13:32:42.029Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/f1/b27d3e2e003cd9a3592c43d099d2ed8d0a947c15281bf8463a256db0b46c/opentelemetry_exporter_otlp_proto_http-1.39.1-py3-none-any.whl", hash = "sha256:d9f5207183dd752a412c4cd564ca8875ececba13be6e9c6c370ffb752fd59985", size = 19641, upload-time = "2025-12-11T13:32:22.248Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/fb/c76080c9ba07e1e8235d24cdcc4d125ef7aa3edf23eb4e497c2e50889adc/opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6", size = 171460, upload-time = "2025-12-11T13:32:49.369Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/98/e91cf858f203d86f4eccdf763dcf01cf03f1dae80c3750f7e635bfa206b6/opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = "sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c", size = 132565, upload-time = "2025-12-11T13:32:35.069Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.60b1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/df/553f93ed38bf22f4b999d9be9c185adb558982214f33eae539d3b5cd0858/opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953", size = 137935, upload-time = "2025-12-11T13:32:50.487Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/5e/5958555e09635d09b75de3c4f8b9cae7335ca545d77392ffe7331534c402/opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb", size = 219982, upload-time = "2025-12-11T13:32:36.955Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -1779,8 +1894,8 @@ wheels = [ [[package]] name = "pipelex" -version = "0.15.7" -source = { registry = "https://pypi.org/simple" } +version = "0.17.3" +source = { editable = "../pipelex" } dependencies = [ { name = "aiofiles" }, { name = "backports-strenum", marker = "python_full_version < '3.11'" }, @@ -1794,8 +1909,13 @@ dependencies = [ { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "openai" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, { name = "pillow" }, { name = "polyfactory" }, + { name = "portkey-ai" }, { name = "posthog" }, { name = "pydantic" }, { name = "pypdfium2" }, @@ -1807,11 +1927,6 @@ dependencies = [ { name = "tomlkit" }, { name = "typer" }, { name = "typing-extensions" }, - { name = "yattag" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/11/64/5fdd37e2cae3d5d4ade7fd510c41e5d335da8a2f5135c0f9e0e1bba37f6f/pipelex-0.15.7.tar.gz", hash = "sha256:5758f7c70d20efe4d2ac704be923a6da1744f170e29cf4816012c41bce637c97", size = 360004, upload-time = "2025-11-18T16:14:40.488Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/c4/c7393a44a19167fd2016547454e5085e2885ff63491656d87f2572e86a8c/pipelex-0.15.7-py3-none-any.whl", hash = "sha256:3a4a81e1a0dc2b3186990214be6d75c5e8daef81e0c89087869af820639c4d0a", size = 563314, upload-time = "2025-11-18T16:14:38.78Z" }, ] [package.optional-dependencies] @@ -1830,6 +1945,69 @@ google-genai = [ { name = "instructor", extra = ["google-genai"] }, ] +[package.metadata] +requires-dist = [ + { name = "aioboto3", marker = "extra == 'bedrock'", specifier = ">=13.4.0" }, + { name = "aiofiles", specifier = ">=23.2.1" }, + { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.60.0" }, + { name = "backports-strenum", marker = "python_full_version < '3.11'", specifier = ">=1.3.0" }, + { name = "boto3", marker = "extra == 'bedrock'", specifier = ">=1.34.131" }, + { name = "boto3-stubs", marker = "extra == 'dev'", specifier = ">=1.35.24" }, + { name = "docling", marker = "extra == 'docling'", specifier = ">=2.64.0" }, + { name = "fal-client", marker = "extra == 'fal'", specifier = ">=0.4.1" }, + { name = "filetype", specifier = ">=1.2.0" }, + { name = "google-auth-oauthlib", marker = "extra == 'google'", specifier = ">=1.2.1" }, + { name = "google-genai", marker = "extra == 'google-genai'" }, + { name = "httpx", specifier = ">=0.23.0,<1.0.0" }, + { name = "instructor", specifier = ">=1.8.3,!=1.11.*,!=1.12.*" }, + { name = "instructor", extras = ["google-genai"], marker = "extra == 'google-genai'" }, + { name = "jinja2", specifier = ">=3.1.4" }, + { name = "json2html", specifier = ">=1.3.0" }, + { name = "kajson", specifier = "==0.3.1" }, + { name = "markdown", specifier = ">=3.6" }, + { name = "mistralai", marker = "extra == 'mistralai'", specifier = "==1.5.2" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = "==1.6.1" }, + { name = "mkdocs-glightbox", marker = "extra == 'docs'", specifier = "==0.4.0" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, + { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, + { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, + { name = "networkx", specifier = ">=3.4.2" }, + { name = "openai", specifier = ">=1.108.1" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "pillow", specifier = ">=11.2.1" }, + { name = "polyfactory", specifier = ">=2.21.0" }, + { name = "portkey-ai", specifier = ">=2.1.0" }, + { name = "posthog", specifier = ">=6.7.0" }, + { name = "pydantic", specifier = ">=2.10.6,<3.0.0" }, + { name = "pylint", marker = "extra == 'dev'", specifier = ">=3.3.8" }, + { name = "pypdfium2", specifier = ">=4.30.0,!=4.30.1" }, + { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.1.1" }, + { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" }, + { name = "pytest-sugar", marker = "extra == 'dev'", specifier = ">=1.0.0" }, + { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" }, + { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "pyyaml", specifier = ">=6.0.2" }, + { name = "rich", specifier = ">=13.8.1" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.6.8" }, + { name = "shortuuid", specifier = ">=1.0.13" }, + { name = "tomli", specifier = ">=2.3.0" }, + { name = "tomlkit", specifier = ">=0.13.2" }, + { name = "typer", specifier = ">=0.16.0" }, + { name = "types-aioboto3", extras = ["bedrock", "bedrock-runtime"], marker = "extra == 'dev'", specifier = ">=13.4.0" }, + { name = "types-aiofiles", marker = "extra == 'dev'", specifier = ">=24.1.0.20240626" }, + { name = "types-markdown", marker = "extra == 'dev'", specifier = ">=3.6.0.20240316" }, + { name = "types-networkx", marker = "extra == 'dev'", specifier = ">=3.3.0.20241020" }, + { name = "types-pyyaml", marker = "extra == 'dev'", specifier = ">=6.0.12.20250326" }, + { name = "typing-extensions", specifier = ">=4.13.2" }, +] +provides-extras = ["anthropic", "bedrock", "docling", "fal", "google", "google-genai", "mistralai", "docs", "dev"] + [[package]] name = "platformdirs" version = "4.3.8" @@ -1861,6 +2039,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/8d/245e02e6ff1f046f70636dc62380ea73b03ab1d7f1fdcf61cbe26bc9c030/polyfactory-2.22.1-py3-none-any.whl", hash = "sha256:7500ee3678d9bc25347c0a73a35d3711cfcf9c7f45ad56d0bb085e9f75ecae7a", size = 63547, upload-time = "2025-07-14T19:37:27.353Z" }, ] +[[package]] +name = "portkey-ai" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "cached-property" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "types-requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d4/8a/f5bbaab806ad61d9959cb7c88c639200feacac1b2ba7b455b97a2f216e7c/portkey_ai-2.1.0.tar.gz", hash = "sha256:c2558041c568eef8528737978089301cb9be056f166a683251831cbfa6a623cb", size = 567417, upload-time = "2025-11-25T20:32:43.102Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/11/c585b90ac842027e5f4f7f7cee72d3197f58ff24b6d7c5f1243aa8fa96be/portkey_ai-2.1.0-py3-none-any.whl", hash = "sha256:2166033f8e198745947fee5321d0bbcfb005afc35468bd5a948fa83dc16b6767", size = 1181622, upload-time = "2025-11-25T20:32:41.185Z" }, +] + [[package]] name = "posthog" version = "6.7.10" @@ -1967,6 +2166,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] +[[package]] +name = "protobuf" +version = "6.33.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/34/44/e49ecff446afeec9d1a66d6bbf9adc21e3c7cea7803a920ca3773379d4f6/protobuf-6.33.2.tar.gz", hash = "sha256:56dc370c91fbb8ac85bc13582c9e373569668a290aa2e66a590c2a0d35ddb9e4", size = 444296, upload-time = "2025-12-06T00:17:53.311Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/91/1e3a34881a88697a7354ffd177e8746e97a722e5e8db101544b47e84afb1/protobuf-6.33.2-cp310-abi3-win32.whl", hash = "sha256:87eb388bd2d0f78febd8f4c8779c79247b26a5befad525008e49a6955787ff3d", size = 425603, upload-time = "2025-12-06T00:17:41.114Z" }, + { url = "https://files.pythonhosted.org/packages/64/20/4d50191997e917ae13ad0a235c8b42d8c1ab9c3e6fd455ca16d416944355/protobuf-6.33.2-cp310-abi3-win_amd64.whl", hash = "sha256:fc2a0e8b05b180e5fc0dd1559fe8ebdae21a27e81ac77728fb6c42b12c7419b4", size = 436930, upload-time = "2025-12-06T00:17:43.278Z" }, + { url = "https://files.pythonhosted.org/packages/b2/ca/7e485da88ba45c920fb3f50ae78de29ab925d9e54ef0de678306abfbb497/protobuf-6.33.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d9b19771ca75935b3a4422957bc518b0cecb978b31d1dd12037b088f6bcc0e43", size = 427621, upload-time = "2025-12-06T00:17:44.445Z" }, + { url = "https://files.pythonhosted.org/packages/7d/4f/f743761e41d3b2b2566748eb76bbff2b43e14d5fcab694f494a16458b05f/protobuf-6.33.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:b5d3b5625192214066d99b2b605f5783483575656784de223f00a8d00754fc0e", size = 324460, upload-time = "2025-12-06T00:17:45.678Z" }, + { url = "https://files.pythonhosted.org/packages/b1/fa/26468d00a92824020f6f2090d827078c09c9c587e34cbfd2d0c7911221f8/protobuf-6.33.2-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8cd7640aee0b7828b6d03ae518b5b4806fdfc1afe8de82f79c3454f8aef29872", size = 339168, upload-time = "2025-12-06T00:17:46.813Z" }, + { url = "https://files.pythonhosted.org/packages/56/13/333b8f421738f149d4fe5e49553bc2a2ab75235486259f689b4b91f96cec/protobuf-6.33.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:1f8017c48c07ec5859106533b682260ba3d7c5567b1ca1f24297ce03384d1b4f", size = 323270, upload-time = "2025-12-06T00:17:48.253Z" }, + { url = "https://files.pythonhosted.org/packages/0e/15/4f02896cc3df04fc465010a4c6a0cd89810f54617a32a70ef531ed75d61c/protobuf-6.33.2-py3-none-any.whl", hash = "sha256:7636aad9bb01768870266de5dc009de2d1b936771b38a793f73cbbf279c91c5c", size = 170501, upload-time = "2025-12-06T00:17:52.211Z" }, +] + [[package]] name = "pyasn1" version = "0.6.1" @@ -2706,6 +2920,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/5f/e0af6f7f6a260d9af67e1db4f54d732abad514252a7a378a6c4d17dd1036/types_pyyaml-6.0.12.20250516-py3-none-any.whl", hash = "sha256:8478208feaeb53a34cb5d970c56a7cd76b72659442e733e268a94dc72b2d0530", size = 20312, upload-time = "2025-05-16T03:08:04.019Z" }, ] +[[package]] +name = "types-requests" +version = "2.32.4.20250913" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/27/489922f4505975b11de2b5ad07b4fe1dca0bca9be81a703f26c5f3acfce5/types_requests-2.32.4.20250913.tar.gz", hash = "sha256:abd6d4f9ce3a9383f269775a9835a4c24e5cd6b9f647d64f88aa4613c33def5d", size = 23113, upload-time = "2025-09-13T02:40:02.309Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/20/9a227ea57c1285986c4cf78400d0a91615d25b24e257fd9e2969606bdfae/types_requests-2.32.4.20250913-py3-none-any.whl", hash = "sha256:78c9c1fffebbe0fa487a418e0fa5252017e9c60d1a2da394077f1780f655d7e1", size = 20658, upload-time = "2025-09-13T02:40:01.115Z" }, +] + [[package]] name = "types-s3transfer" version = "0.13.0" @@ -2997,7 +3223,10 @@ wheels = [ ] [[package]] -name = "yattag" -version = "1.16.1" +name = "zipp" +version = "3.23.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1c/1a/d3b2a2b8f843f5e7138471c4a5c9172ef62bb41239aa4371784b7448110c/yattag-1.16.1.tar.gz", hash = "sha256:baa8f254e7ea5d3e0618281ad2ff5610e0e5360b3608e695c29bfb3b29d051f4", size = 29069, upload-time = "2024-11-02T22:38:30.443Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] From 716bfd510a6d0ea4769e9f4f8117331595c3faf9 Mon Sep 17 00:00:00 2001 From: Louis Choquel <8851983+lchoquel@users.noreply.github.com> Date: Sun, 14 Dec 2025 22:21:26 +0100 Subject: [PATCH 2/8] PLX fixes related to new checks --- cocode/pipelines/doc_proofread/doc_proofread.plx | 2 +- cocode/pipelines/swe_diff/changelog_enhanced.plx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cocode/pipelines/doc_proofread/doc_proofread.plx b/cocode/pipelines/doc_proofread/doc_proofread.plx index 8d65674..8899802 100644 --- a/cocode/pipelines/doc_proofread/doc_proofread.plx +++ b/cocode/pipelines/doc_proofread/doc_proofread.plx @@ -107,7 +107,7 @@ Make it concise and focused on the most critical issues only. [pipe.doc_proofread] type = "PipeSequence" description = "Complete documentation proofreading pipeline for CLI usage" -inputs = { repo_map = "RepositoryMap", doc_files = "DocumentationFile" } +inputs = { repo_map = "RepositoryMap", doc_files = "DocumentationFile[]" } output = "MarkdownReport" steps = [ { pipe = "proofread_doc_sequence", batch_over = "doc_files", batch_as = "doc_file", result = "all_inconsistencies" }, diff --git a/cocode/pipelines/swe_diff/changelog_enhanced.plx b/cocode/pipelines/swe_diff/changelog_enhanced.plx index 3b4c8c4..c6f46e9 100644 --- a/cocode/pipelines/swe_diff/changelog_enhanced.plx +++ b/cocode/pipelines/swe_diff/changelog_enhanced.plx @@ -1,5 +1,5 @@ -domain = "changelog" -description = "Pipelines for analyzing differences between two versions of a codebase." +domain = "changelog_enhanced" +description = "Pipelines for analyzing differences between two versions of a codebase — enhanced version." [concept] DraftChangelog = "A draft changelog with sections for each type of change." From 9a13f8c337c856b65d61e6b6d2a1526db2b0a2b9 Mon Sep 17 00:00:00 2001 From: Louis Choquel <8851983+lchoquel@users.noreply.github.com> Date: Sun, 11 Jan 2026 13:29:39 +0100 Subject: [PATCH 3/8] Update for Pipelex --- .pipelex/inference/backends.toml | 5 + .pipelex/inference/backends/azure_openai.toml | 63 +++++++++++- .pipelex/inference/backends/fal.toml | 7 +- .pipelex/inference/backends/google.toml | 7 ++ .pipelex/inference/backends/huggingface.toml | 42 ++++++++ .pipelex/inference/backends/mistral.toml | 26 ++++- .pipelex/inference/backends/openai.toml | 75 ++++++++++---- .pipelex/inference/deck/base_deck.toml | 10 +- .pipelex/inference/routing_profiles.toml | 89 +++++++++++------ .pipelex/pipelex.toml | 98 ++++++++++++------- .pipelex/telemetry.toml | 16 ++- .../ai_instructions/ai_instructions_cli.py | 3 - cocode/cli/analyze/analyze_cli.py | 2 - cocode/cli/changelog/changelog_cli.py | 2 - cocode/cli/doc/doc_cli.py | 5 - .../pipelines/doc_proofread/doc_proofread.plx | 24 ++--- cocode/swe/swe_cmd.py | 4 +- pyproject.toml | 1 + uv.lock | 3 +- 19 files changed, 360 insertions(+), 122 deletions(-) create mode 100644 .pipelex/inference/backends/huggingface.toml diff --git a/.pipelex/inference/backends.toml b/.pipelex/inference/backends.toml index a243cb1..46cbb79 100644 --- a/.pipelex/inference/backends.toml +++ b/.pipelex/inference/backends.toml @@ -54,6 +54,11 @@ enabled = false endpoint = "https://api.groq.com/openai/v1" api_key = "${GROQ_API_KEY}" +[huggingface] +display_name = "Hugging Face" +enabled = false +api_key = "${HF_TOKEN}" + [mistral] display_name = "Mistral AI" enabled = false diff --git a/.pipelex/inference/backends/azure_openai.toml b/.pipelex/inference/backends/azure_openai.toml index 35ccb48..51464f2 100644 --- a/.pipelex/inference/backends/azure_openai.toml +++ b/.pipelex/inference/backends/azure_openai.toml @@ -135,6 +135,27 @@ outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } valued_constraints = { fixed_temperature = 1 } +["gpt-5.1-codex"] +model_id = "gpt-5.1-codex-2025-11-13" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +# --- GPT-5.2 Series ------------------------------------------------------------- +["gpt-5.2"] +model_id = "gpt-5.2-2025-12-11" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.75, output = 14.0 } + +["gpt-5.2-chat"] +model_id = "gpt-5.2-chat-2025-12-11" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + ################################################################################ # IMAGE GENERATION MODELS ################################################################################ @@ -146,4 +167,44 @@ model_type = "img_gen" model_id = "gpt-image-1-2025-04-15" inputs = ["text"] outputs = ["image"] -costs = { input = 0.04, output = 0.0 } +costs = { input = 10, output = 40 } + +[gpt-image-1.rules] +num_images = "gpt" +aspect_ratio = "gpt" +background = "gpt" +inference = "gpt" +safety_checker = "unavailable" +output_format = "gpt" + +[gpt-image-1-mini] +sdk = "azure_rest_img_gen" +model_type = "img_gen" +model_id = "gpt-image-1-mini-2025-10-06" +inputs = ["text"] +outputs = ["image"] +costs = { input = 2.5, output = 8 } + +[gpt-image-1-mini.rules] +num_images = "gpt" +aspect_ratio = "gpt" +background = "gpt" +inference = "gpt" +safety_checker = "unavailable" +output_format = "gpt" + +["gpt-image-1.5"] +sdk = "azure_rest_img_gen" +model_type = "img_gen" +model_id = "gpt-image-1.5-2025-12-16" +inputs = ["text"] +outputs = ["image"] +costs = { input = 8, output = 32 } + +["gpt-image-1.5".rules] +num_images = "gpt" +aspect_ratio = "gpt" +background = "gpt" +inference = "gpt" +safety_checker = "unavailable" +output_format = "gpt" diff --git a/.pipelex/inference/backends/fal.toml b/.pipelex/inference/backends/fal.toml index 608bfed..42e2f77 100644 --- a/.pipelex/inference/backends/fal.toml +++ b/.pipelex/inference/backends/fal.toml @@ -41,6 +41,7 @@ num_images = "fal" aspect_ratio = "flux" inference = "flux" safety_checker = "available" +output_format = "flux_1" specific = "fal" ["flux-pro/v1.1"] @@ -54,6 +55,7 @@ num_images = "fal" aspect_ratio = "flux" inference = "flux" safety_checker = "available" +output_format = "flux_1" specific = "fal" ["flux-pro/v1.1-ultra"] @@ -67,6 +69,7 @@ num_images = "fal" aspect_ratio = "flux_11_ultra" inference = "flux_11_ultra" safety_checker = "available" +output_format = "flux_1" specific = "fal" [flux-2] @@ -80,6 +83,7 @@ num_images = "fal" aspect_ratio = "flux" inference = "flux" safety_checker = "available" +output_format = "flux_2" specific = "fal" # --- SDXL models -------------------------------------------------------------- @@ -91,7 +95,8 @@ costs = { input = 0.0003, output = 0.0 } [fast-lightning-sdxl.rules] num_images = "fal" -specific = "fal" aspect_ratio = "flux" inference = "sdxl_lightning" safety_checker = "unavailable" +output_format = "sdxl" +specific = "fal" diff --git a/.pipelex/inference/backends/google.toml b/.pipelex/inference/backends/google.toml index e0d3f0a..36e19c2 100644 --- a/.pipelex/inference/backends/google.toml +++ b/.pipelex/inference/backends/google.toml @@ -68,6 +68,13 @@ outputs = ["text", "structured"] max_prompt_images = 3000 costs = { input = 2, output = 12.0 } +["gemini-3.0-flash-preview"] +model_id = "gemini-3-flash-preview" +inputs = ["text", "images"] +outputs = ["text", "structured"] +max_prompt_images = 3000 +costs = { input = 0.5, output = 3.0 } + ################################################################################ # IMAGE GENERATION MODELS (Nano Banana) ################################################################################ diff --git a/.pipelex/inference/backends/huggingface.toml b/.pipelex/inference/backends/huggingface.toml new file mode 100644 index 0000000..8916f9b --- /dev/null +++ b/.pipelex/inference/backends/huggingface.toml @@ -0,0 +1,42 @@ +################################################################################ +# Hugging Face Backend Configuration +################################################################################ +# +# This file defines the model specifications for Hugging Face models. +# It contains model definitions for various image generation models +# accessible through the Hugging Face Inference API with provider="auto". +# +# Configuration structure: +# - Each model is defined in its own section with the model name as the header +# - Headers with dots or slashes must be quoted (e.g., ["stabilityai/stable-diffusion-2-1"]) +# - Model costs are in USD per million tokens (input/output) +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +################################################################################ + +################################################################################ +# MODEL DEFAULTS +################################################################################ + +[defaults] +model_type = "img_gen" +sdk = "huggingface_img_gen" + +################################################################################ +# IMAGE GENERATION MODELS +################################################################################ + +# --- Qwen Image Models -------------------------------------------------- +[qwen-image] +model_id = "Qwen/Qwen-Image" +inputs = ["text"] +outputs = ["image"] +costs = { input = 0.0, output = 0.0 } +variant = "fal-ai" +# variant = "replicate" + +[qwen-image.rules] +aspect_ratio = "qwen_image" +inference = "qwen_image" diff --git a/.pipelex/inference/backends/mistral.toml b/.pipelex/inference/backends/mistral.toml index c4b49e3..c8c4b05 100644 --- a/.pipelex/inference/backends/mistral.toml +++ b/.pipelex/inference/backends/mistral.toml @@ -133,11 +133,33 @@ costs = { input = 0.4, output = 2.0 } # EXTRACTION MODELS ################################################################################ +# TODO: add support to pricing per page + # --- OCR Models --------------------------------------------------------------- +[mistral-ocr-2503] +model_type = "text_extractor" +model_id = "mistral-ocr-2503" +max_tokens = 16384 +inputs = ["pdf", "image"] +outputs = ["pages"] + +[mistral-ocr-2505] +model_type = "text_extractor" +model_id = "mistral-ocr-2505" +max_tokens = 16384 +inputs = ["pdf", "image"] +outputs = ["pages"] + +[mistral-ocr-2512] +model_type = "text_extractor" +model_id = "mistral-ocr-2512" +max_tokens = 16384 +inputs = ["pdf", "image"] +outputs = ["pages"] + [mistral-ocr] model_type = "text_extractor" model_id = "mistral-ocr-latest" -max_tokens = 131072 +max_tokens = 16384 inputs = ["pdf", "image"] outputs = ["pages"] -costs = { input = 0.4, output = 2.0 } diff --git a/.pipelex/inference/backends/openai.toml b/.pipelex/inference/backends/openai.toml index bfb3dd9..e61d52e 100644 --- a/.pipelex/inference/backends/openai.toml +++ b/.pipelex/inference/backends/openai.toml @@ -39,83 +39,72 @@ costs = { input = 0.5, output = 1.5 } # --- GPT-4 Series ------------------------------------------------------------- [gpt-4] -model_id = "gpt-4" +inputs = ["text"] +outputs = ["text"] costs = { input = 30.0, output = 60.0 } [gpt-4-turbo] -model_id = "gpt-4-turbo" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 10.0, output = 30.0 } # --- GPT-4o Series ------------------------------------------------------------ [gpt-4o-2024-11-20] -model_id = "gpt-4o-2024-11-20" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 2.5, output = 10.0 } [gpt-4o] -model_id = "gpt-4o" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 2.5, output = 10.0 } [gpt-4o-mini-2024-07-18] -model_id = "gpt-4o-mini-2024-07-18" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.15, output = 0.6 } [gpt-4o-mini] -model_id = "gpt-4o-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.15, output = 0.6 } # --- GPT-4.1 Series ----------------------------------------------------------- ["gpt-4.1"] -model_id = "gpt-4.1" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 2, output = 8 } ["gpt-4.1-mini"] -model_id = "gpt-4.1-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.4, output = 1.6 } ["gpt-4.1-nano"] -model_id = "gpt-4.1-nano" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.1, output = 0.4 } # --- o Series ---------------------------------------------------------------- [o1] -model_id = "o1" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 15.0, output = 60.0 } valued_constraints = { fixed_temperature = 1 } [o3-mini] -model_id = "o3-mini" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 1.1, output = 4.4 } valued_constraints = { fixed_temperature = 1 } [o3] -model_id = "o3" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 10.0, output = 40.0 } valued_constraints = { fixed_temperature = 1 } [o4-mini] -model_id = "o4-mini" inputs = ["text"] outputs = ["text", "structured"] costs = { input = 1.1, output = 4.4 } @@ -123,21 +112,18 @@ valued_constraints = { fixed_temperature = 1 } # --- GPT-5 Series ------------------------------------------------------------- [gpt-5] -model_id = "gpt-5" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } valued_constraints = { fixed_temperature = 1 } [gpt-5-mini] -model_id = "gpt-5-mini" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.25, output = 2.0 } valued_constraints = { fixed_temperature = 1 } [gpt-5-nano] -model_id = "gpt-5-nano" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 0.05, output = 0.4 } @@ -150,13 +136,50 @@ outputs = ["text"] costs = { input = 1.25, output = 10.0 } valued_constraints = { fixed_temperature = 1 } +[gpt-5-codex] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + # --- GPT-5.1 Series ------------------------------------------------------------- ["gpt-5.1"] -model_id = "gpt-5.1" inputs = ["text", "images"] outputs = ["text", "structured"] costs = { input = 1.25, output = 10.0 } +["gpt-5.1-chat"] +model_id = "gpt-5.1-chat-latest" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +["gpt-5.1-codex"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +["gpt-5.1-codex-max"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.25, output = 10.0 } +valued_constraints = { fixed_temperature = 1 } + +# --- GPT-5.2 Series ------------------------------------------------------------- +["gpt-5.2"] +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.75, output = 14.0 } + +["gpt-5.2-chat"] +model_id = "gpt-5.2-chat-latest" +inputs = ["text", "images"] +outputs = ["text", "structured"] +costs = { input = 1.75, output = 14.0 } +valued_constraints = { fixed_temperature = 1 } + ################################################################################ # IMAGE GENERATION MODELS ################################################################################ @@ -165,7 +188,21 @@ costs = { input = 1.25, output = 10.0 } [gpt-image-1] sdk = "openai_img_gen" model_type = "img_gen" -model_id = "gpt-image-1" inputs = ["text"] outputs = ["image"] -costs = { input = 0.04, output = 0.0 } +costs = { input = 10, output = 40 } + +[gpt-image-1-mini] +sdk = "openai_img_gen" +model_type = "img_gen" +inputs = ["text"] +outputs = ["image"] +costs = { input = 2.5, output = 8 } + +["gpt-image-1.5"] +sdk = "openai_img_gen" +model_type = "img_gen" +model_id = "gpt-image-1.5" +inputs = ["text"] +outputs = ["image"] +costs = { input = 8, output = 32 } diff --git a/.pipelex/inference/deck/base_deck.toml b/.pipelex/inference/deck/base_deck.toml index d7dba4d..bb28c3a 100644 --- a/.pipelex/inference/deck/base_deck.toml +++ b/.pipelex/inference/deck/base_deck.toml @@ -108,7 +108,11 @@ cheap_llm_for_creativity = [ ] # --- Waterfalls for Extracts --------------------------------------------------------------------- -pdf_text_extractor = ["mistral-ocr", "pypdfium2-extract-text"] +pdf_text_extractor = [ + "azure-document-intelligence", + "mistral-ocr", + "pypdfium2-extract-text", +] image_text_extractor = ["mistral-ocr"] #################################################################################################### @@ -172,7 +176,7 @@ for_object = "cheap_llm_for_structured" choice_default = "extract_text_from_visuals" [extract.presets] -extract_text_from_visuals = { model = "mistral-ocr", max_nb_images = 100, image_min_size = 50 } +extract_text_from_visuals = { model = "azure-document-intelligence", max_nb_images = 100, image_min_size = 50 } extract_text_from_pdf = { model = "pypdfium2-extract-text", max_nb_images = 100, image_min_size = 50 } #################################################################################################### @@ -196,5 +200,3 @@ img_gen_for_diagram = { model = "base-img-gen", quality = "medium", guidance_sca img_gen_for_mockup = { model = "base-img-gen", quality = "medium", guidance_scale = 6.5, is_moderated = true, safety_tolerance = 3 } img_gen_for_product = { model = "best-img-gen", quality = "high", guidance_scale = 8.5, is_moderated = true, safety_tolerance = 2 } img_gen_for_testing = { model = "fast-img-gen", nb_steps = 4, guidance_scale = 4.0, is_moderated = true, safety_tolerance = 4 } - -# img_gen_for_testing = { model = "nano-banana" } diff --git a/.pipelex/inference/routing_profiles.toml b/.pipelex/inference/routing_profiles.toml index 76891b0..d27e659 100644 --- a/.pipelex/inference/routing_profiles.toml +++ b/.pipelex/inference/routing_profiles.toml @@ -9,22 +9,54 @@ # ========================================================================================= # Which profile to use (change this to switch routing) -active = "pipelex_gateway_first" +# TODO: TBD +active = "pipelex_gateway_first" # Change to "pipelex_gateway_first" after enabling gateway -# We recommend using the "pipelex_first" profile to get a head start with all models. -# The Pipelex Inference backend is currently not recommended for production use, -# but it's great for development and testing. -# To use the Pipelex Inference backend (pipelex_first profile): +# We recommend using the "pipelex_gateway_first" profile to get a head start with all models. +# To use the Pipelex Gateway backend: # 1. Join our Discord community to get your free API key (no credit card required): # Visit https://go.pipelex.com/discord and request your key in the appropriate channel -# 2. Set the environment variable: export PIPELEX_INFERENCE_API_KEY="your-api-key" -# 3. The .pipelex/inference/backends.toml is already configured with api_key = "${PIPELEX_INFERENCE_API_KEY}" -# which will get the key from the environment variable. +# 2. Set the environment variable (or add it to your .env file): +# - Linux/macOS: export PIPELEX_GATEWAY_API_KEY="your-api-key" +# - Windows CMD: set PIPELEX_GATEWAY_API_KEY=your-api-key +# - Windows PowerShell: $env:PIPELEX_GATEWAY_API_KEY="your-api-key" +# 3. The .pipelex/inference/backends.toml is already configured with api_key = "${PIPELEX_GATEWAY_API_KEY}" +# which will get the key from the environment variable. # ========================================================================================= # Routing Profiles # ========================================================================================= +[profiles.pipelex_gateway_first] +description = "Use Pipelex Gateway backend for all its supported models" +default = "pipelex_gateway" +fallback_order = [ + "pipelex_gateway", + "azure_openai", + "bedrock", + "google", + "blackboxai", + "mistral", + "fal", +] + +[profiles.pipelex_gateway_first.routes] +# Pattern matching: "model-pattern" = "backend-name" + +[profiles.pipelex_gateway_first.optional_routes] # Each optional route is considered only if its backend is available +"gpt-*" = "pipelex_gateway" +"gpt-image-1" = "openai" +"claude-*" = "pipelex_gateway" +"grok-*" = "pipelex_gateway" +"gemini-*" = "pipelex_gateway" +"*-sdxl" = "fal" +"flux-*" = "fal" +"mistral-ocr" = "mistral" + +[profiles.all_pipelex_gateway] +description = "Use Pipelex Gateway for all its supported models" +default = "pipelex_gateway" + [profiles.all_anthropic] description = "Use Anthropic backend for all its supported models" default = "anthropic" @@ -49,6 +81,14 @@ default = "fal" description = "Use Google GenAI backend for all its supported models" default = "google" +[profiles.all_groq] +description = "Use groq backend for all its supported models" +default = "groq" + +[profiles.all_huggingface] +description = "Use HuggingFace backend for all its supported models" +default = "huggingface" + [profiles.all_mistral] description = "Use Mistral backend for all its supported models" default = "mistral" @@ -61,14 +101,22 @@ default = "ollama" description = "Use OpenAI backend for all its supported models" default = "openai" -[profiles.all_vertexai] -description = "Use Vertex AI backend for all its supported models" -default = "vertexai" +[profiles.all_portkey] +description = "Use Portkey backend for all its supported models" +default = "portkey" + +[profiles.all_scaleway] +description = "Use Scaleway backend for all its supported models" +default = "scaleway" [profiles.all_xai] description = "Use xAI backend for all its supported models" default = "xai" +[profiles.all_internal] +description = "Use internal backend for all its supported models" +default = "internal" + # ========================================================================================= # Custom Profiles # ========================================================================================= @@ -124,22 +172,3 @@ description = "Example routing profile using specific models" "grok-3" = "pipelex_inference" "grok-3-mini" = "xai" - -[profiles.pipelex_gateway_first] -description = "Use Pipelex Inference backend for all its supported models" -default = "pipelex_gateway" -fallback_order = ["pipelex_gateway", "azure_openai", "bedrock", "blackboxai", "mistral", "fal"] - -[profiles.pipelex_gateway_first.routes] -# Pattern matching: "model-pattern" = "backend-name" -"gpt-*" = "pipelex_gateway" -"claude-*" = "pipelex_gateway" -"grok-*" = "pipelex_gateway" -"gemini-*" = "pipelex_gateway" - -[profiles.pipelex_gateway_first.optional_routes] # Each optional route is considered only if its backend is available -"*-sdxl" = "fal" -"flux-*" = "fal" -"gpt-image-1" = "openai" -"mistral-ocr" = "mistral" - diff --git a/.pipelex/pipelex.toml b/.pipelex/pipelex.toml index a1faa16..2983ada 100644 --- a/.pipelex/pipelex.toml +++ b/.pipelex/pipelex.toml @@ -2,11 +2,11 @@ # Pipelex Configuration File #################################################################################################### # -# This configuration file is copied to your project's .pipelex/ directory when you run: -# pipelex init config +# This configuration file is copied to client projects' .pipelex/ directory when running: +# `pipelex init config` # # Purpose: -# - This file allows you to override Pipelex's default settings for your specific project +# - This file allows to override Pipelex's default settings for specific projects # - Feel free to modify any settings below to suit your needs # - You can add any configuration sections that exist in the main pipelex.toml # @@ -14,7 +14,7 @@ # - See the full default configuration in: pipelex/pipelex.toml (in the Pipelex package) # - See the configuration structure classes in: pipelex/config.py and pipelex/cogt/config_cogt.py # -# Common Customizations: +# Common customizations are proposed below, such as: # - Logging levels and behavior # - Excluded directories for scanning # - LLM prompt dumping for debugging @@ -26,59 +26,89 @@ # #################################################################################################### + +[pipelex.storage_config] +# Uncomment to change the storage method: "local" (default) or "in_memory" +# is_fetch_remote_content_enabled = true +# uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" +# method = "local" +# local_storage_path = ".pipelex/storage" + [pipelex.scan_config] -excluded_dirs = [ - ".venv", - "venv", - "env", - ".env", - "virtualenv", - ".virtualenv", - ".git", - "__pycache__", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - "node_modules", - "results", -] +# Uncomment to customize the excluded directories for scanning +# excluded_dirs = [ +# ".venv", +# "venv", +# "env", +# ".env", +# "virtualenv", +# ".virtualenv", +# ".git", +# "__pycache__", +# ".pytest_cache", +# ".mypy_cache", +# ".ruff_cache", +# "node_modules", +# "results", +# ] + +[pipelex.builder_config] +# Uncomment to change where the generated pipelines are saved: +# default_output_dir = "." +# default_bundle_file_name = "bundle" +# default_directory_base_name = "pipeline" [pipelex.log_config] -default_log_level = "INFO" +# Uncomment to change the default log level: +# default_log_level = "INFO" + +# Uncomment to log to stderr instead of stdout +# console_log_target = "stderr" +# console_print_target = "stderr" [pipelex.log_config.package_log_levels] -pipelex = "INFO" +# Uncomment to change the log level for specific packages: +# pipelex = "INFO" [cogt] [cogt.model_deck_config] -# Uncomment to disable model fallback: it will raise errors instead of using secondary model options +# Uncomment to disable model fallback: it will raise errors instead of using secondary model options: # is_model_fallback_enabled = false # Uncomment to change the reaction to missing presets: "raise" (default), "log" or "none" # missing_presets_reaction = "raise" +[cogt.tenacity_config] +# Uncomment to change those values as needed: +# max_retries = 50 # Maximum number of retry attempts before giving up +# wait_multiplier = 0.2 # Multiplier applied to the wait time between retries (in seconds) +# wait_max = 20 # Maximum wait time between retries (in seconds) +# wait_exp_base = 1.3 # Base for exponential backoff calculation + [cogt.llm_config] -# Uncomment any of these to enable dumping the inputs or outputs of text-genration with an LLM +# Uncomment any of these to enable dumping the inputs or outputs of text-generation with an LLM: # is_dump_text_prompts_enabled = true # is_dump_response_text_enabled = true [cogt.llm_config.instructor_config] -# Uncomment any of these to enable dumping the kwargs, response or errors of the instructor +# Uncomment any of these to enable dumping the kwargs, response or errors when generating structured content: # is_dump_kwargs_enabled = true # is_dump_response_enabled = true # is_dump_error_enabled = true [pipelex.observer_config] -observer_dir = "results/observer" +# Uncomment to change the directory where the observer will save its results: +# observer_dir = "results/observer" [pipelex.feature_config] -# WIP/Experimental feature flags -is_pipeline_tracking_enabled = false -is_reporting_enabled = true +# WIP/Experimental feature flags: +# is_pipeline_tracking_enabled = false +# is_reporting_enabled = true [pipelex.reporting_config] -is_log_costs_to_console = false -is_generate_cost_report_file_enabled = true -cost_report_dir_path = "reports" -cost_report_base_name = "cost_report" -cost_report_extension = "csv" -cost_report_unit_scale = 1.0 +# Uncomment to customize the reporting configuration: +# is_log_costs_to_console = false +# is_generate_cost_report_file_enabled = false +# cost_report_dir_path = "reports" +# cost_report_base_name = "cost_report" +# cost_report_extension = "csv" +# cost_report_unit_scale = 1.0 diff --git a/.pipelex/telemetry.toml b/.pipelex/telemetry.toml index 75207a7..eb2c537 100644 --- a/.pipelex/telemetry.toml +++ b/.pipelex/telemetry.toml @@ -21,7 +21,7 @@ # PostHog Configuration (Event tracking + AI span tracing) # ────────────────────────────────────────────────────────────────────────────── -[posthog] +[custom_posthog] mode = "off" # Values: "off" | "anonymous" | "identified" # user_id = "your_user_id" # Required when mode = "identified" endpoint = "${POSTHOG_ENDPOINT}" # Default: https://us.i.posthog.com (or https://eu.i.posthog.com for EU) @@ -37,16 +37,24 @@ redact_properties = [ ] # Event properties to redact # AI span tracing to YOUR PostHog (does NOT affect Langfuse/OTLP - they receive full data) -[posthog.tracing] +[custom_posthog.tracing] enabled = false # Send AI spans to your PostHog # Privacy controls for data sent to YOUR PostHog only -[posthog.tracing.capture] +[custom_posthog.tracing.capture] content = false # Capture prompt/completion content # content_max_length = 1000 # Max length for captured content (omit for unlimited) pipe_codes = false # Include pipe codes in span names/attributes output_class_names = false # Include output class names in span names/attributes +# ────────────────────────────────────────────────────────────────────────────── +# Portkey SDK Configuration +# ────────────────────────────────────────────────────────────────────────────── + +[custom_portkey] +force_debug_enabled = false +force_tracing_enabled = false + # ────────────────────────────────────────────────────────────────────────────── # Langfuse Integration # Note: Langfuse receives FULL span data (no redaction) @@ -73,7 +81,7 @@ enabled = false # Controls which integration modes can use custom telemetry settings above. # ────────────────────────────────────────────────────────────────────────────── -[custom_telemetry_allowed_modes] +[telemetry_allowed_modes] ci = false # CI environments don't use custom telemetry cli = true # CLI usage allows custom telemetry docker = true # Docker deployments allow custom telemetry diff --git a/cocode/cli/ai_instructions/ai_instructions_cli.py b/cocode/cli/ai_instructions/ai_instructions_cli.py index 8ee3868..6da4dcd 100644 --- a/cocode/cli/ai_instructions/ai_instructions_cli.py +++ b/cocode/cli/ai_instructions/ai_instructions_cli.py @@ -6,7 +6,6 @@ from typing import Annotated, List, Optional import typer -from pipelex.hub import get_pipeline_tracker from cocode.common import validate_repo_path from cocode.swe.swe_cmd import swe_ai_instruction_update_from_diff @@ -60,5 +59,3 @@ def ai_instructions_update_cmd( exclude_patterns=exclude_patterns, ) ) - - get_pipeline_tracker().output_flowchart() diff --git a/cocode/cli/analyze/analyze_cli.py b/cocode/cli/analyze/analyze_cli.py index 6a23ae1..c40c015 100644 --- a/cocode/cli/analyze/analyze_cli.py +++ b/cocode/cli/analyze/analyze_cli.py @@ -6,7 +6,6 @@ from typing import Annotated, List, Optional import typer -from pipelex.hub import get_pipeline_tracker from pipelex.pipe_run.pipe_run_mode import PipeRunMode from pipelex.tools.misc.file_utils import load_text_from_path @@ -100,4 +99,3 @@ def analyze_diff_cmd( exclude_patterns=exclude_patterns, ) ) - get_pipeline_tracker().output_flowchart() diff --git a/cocode/cli/changelog/changelog_cli.py b/cocode/cli/changelog/changelog_cli.py index b191ff7..e4d6caf 100644 --- a/cocode/cli/changelog/changelog_cli.py +++ b/cocode/cli/changelog/changelog_cli.py @@ -6,7 +6,6 @@ from typing import Annotated, List, Optional import typer -from pipelex.hub import get_pipeline_tracker from pipelex.pipe_run.pipe_run_mode import PipeRunMode from cocode.common import get_output_dir, validate_repo_path @@ -77,4 +76,3 @@ def changelog_update_cmd( exclude_patterns=exclude_patterns, ) ) - get_pipeline_tracker().output_flowchart() diff --git a/cocode/cli/doc/doc_cli.py b/cocode/cli/doc/doc_cli.py index 3064235..41d8abc 100644 --- a/cocode/cli/doc/doc_cli.py +++ b/cocode/cli/doc/doc_cli.py @@ -6,7 +6,6 @@ from typing import Annotated, List, Optional import typer -from pipelex.hub import get_pipeline_tracker from cocode.common import validate_repo_path from cocode.swe.swe_cmd import swe_doc_proofread, swe_doc_update_from_diff @@ -65,8 +64,6 @@ def doc_update_cmd( ) ) - get_pipeline_tracker().output_flowchart() - @doc_app.command("proofread") def doc_proofread_cmd( @@ -130,5 +127,3 @@ def doc_proofread_cmd( exclude_patterns=exclude_patterns, ) ) - - get_pipeline_tracker().output_flowchart() diff --git a/cocode/pipelines/doc_proofread/doc_proofread.plx b/cocode/pipelines/doc_proofread/doc_proofread.plx index 8899802..2ec9504 100644 --- a/cocode/pipelines/doc_proofread/doc_proofread.plx +++ b/cocode/pipelines/doc_proofread/doc_proofread.plx @@ -11,6 +11,17 @@ MarkdownReport = "A markdown report containing documentation inconsistencies for [pipe] +[pipe.proofread_doc_sequence] +type = "PipeSequence" +description = "Process a single documentation file to find inconsistencies" +inputs = { doc_file = "DocumentationFile", repo_map = "RepositoryMap" } +output = "DocumentationInconsistency[]" +steps = [ + { pipe = "find_related_code_files", result = "related_file_paths" }, + { pipe = "read_doc_file", result = "related_files" }, + { pipe = "proofread_single_doc", result = "inconsistencies" } +] + [pipe.find_related_code_files] type = "PipeLLM" description = "Find code files that implement or use elements mentioned in docs" @@ -61,17 +72,6 @@ Look for things that would BREAK user code, like: Skip anything that's not a showstopper. If it would just be confusing but still work, ignore it. """ -[pipe.proofread_doc_sequence] -type = "PipeSequence" -description = "Process a single documentation file to find inconsistencies" -inputs = { doc_file = "DocumentationFile", repo_map = "RepositoryMap" } -output = "DocumentationInconsistency" -steps = [ - { pipe = "find_related_code_files", result = "related_file_paths" }, - { pipe = "read_doc_file", result = "related_files" }, - { pipe = "proofread_single_doc", result = "inconsistencies" } -] - [pipe.create_cursor_report] type = "PipeLLM" description = "Create a markdown report with inconsistencies formatted as a Cursor prompt" @@ -118,6 +118,6 @@ steps = [ type = "PipeFunc" description = "Read the content of related codebase files" inputs = { related_file_paths = "FilePath" } -output = "CodebaseFileContent" +output = "CodebaseFileContent[]" function_name = "read_file_content" diff --git a/cocode/swe/swe_cmd.py b/cocode/swe/swe_cmd.py index ed99c22..71d60c0 100644 --- a/cocode/swe/swe_cmd.py +++ b/cocode/swe/swe_cmd.py @@ -355,12 +355,12 @@ async def swe_doc_proofread( doc_files = create_documentation_files_from_paths(doc_file_paths, doc_dir) repo_map_stuff = StuffFactory.make_stuff( - concept=get_required_concept(concept_string="doc_proofread.RepositoryMap"), + concept=get_required_concept(concept_ref="doc_proofread.RepositoryMap"), content=RepositoryMap(repo_content=repo_text), name="repo_map", ) doc_files_stuff = StuffFactory.make_stuff( - concept=get_required_concept(concept_string="doc_proofread.DocumentationFile"), + concept=get_required_concept(concept_ref="doc_proofread.DocumentationFile"), content=ListContent[DocumentationFile](items=doc_files), name="doc_files", ) diff --git a/pyproject.toml b/pyproject.toml index 71be215..5a978a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", ] diff --git a/uv.lock b/uv.lock index d926a0b..fdaac9e 100644 --- a/uv.lock +++ b/uv.lock @@ -1959,6 +1959,7 @@ requires-dist = [ { name = "google-auth-oauthlib", marker = "extra == 'google'", specifier = ">=1.2.1" }, { name = "google-genai", marker = "extra == 'google-genai'" }, { name = "httpx", specifier = ">=0.23.0,<1.0.0" }, + { name = "huggingface-hub", marker = "extra == 'huggingface'", specifier = ">=0.23,<1.0.0" }, { name = "instructor", specifier = ">=1.8.3,!=1.11.*,!=1.12.*" }, { name = "instructor", extras = ["google-genai"], marker = "extra == 'google-genai'" }, { name = "jinja2", specifier = ">=3.1.4" }, @@ -2006,7 +2007,7 @@ requires-dist = [ { name = "types-pyyaml", marker = "extra == 'dev'", specifier = ">=6.0.12.20250326" }, { name = "typing-extensions", specifier = ">=4.13.2" }, ] -provides-extras = ["anthropic", "bedrock", "docling", "fal", "google", "google-genai", "mistralai", "docs", "dev"] +provides-extras = ["anthropic", "bedrock", "docling", "fal", "google", "google-genai", "huggingface", "mistralai", "docs", "dev"] [[package]] name = "platformdirs" From 506974e8f96907e9aaead7f1c1fbf5d85a041dfb Mon Sep 17 00:00:00 2001 From: Louis Choquel <8851983+lchoquel@users.noreply.github.com> Date: Mon, 12 Jan 2026 17:02:38 +0100 Subject: [PATCH 4/8] Use feature/Chicago --- cocode/cli/main.py | 3 +- cocode/common.py | 2 + pyproject.toml | 7 +- tests/conftest.py | 6 +- tests/integration/test_hello_world.py | 1 + .../hello_world.plx | 0 uv.lock | 68 +------------------ 7 files changed, 16 insertions(+), 71 deletions(-) rename tests/{test_pipelines => pipelines}/hello_world.plx (100%) diff --git a/cocode/cli/main.py b/cocode/cli/main.py index 857facd..80fcd2c 100644 --- a/cocode/cli/main.py +++ b/cocode/cli/main.py @@ -17,6 +17,7 @@ from cocode.cli.doc.doc_cli import doc_app from cocode.cli.features.features_cli import features_app from cocode.cli.repo.repo_cli import repo_app +from cocode.common import PIPELINE_LIBRARY_DIRS from cocode.github.github_cli import github_app from cocode.repox.repox_cli import repox_app from cocode.validation_cli import validation_app @@ -64,7 +65,7 @@ def get_command(self, ctx: Context, cmd_name: str) -> Optional[Command]: @app.callback(invoke_without_command=True) def main(ctx: TyperContext) -> None: """Initialize Pipelex system before any command runs.""" - Pipelex.make() + Pipelex.make(library_dirs=PIPELINE_LIBRARY_DIRS) if ctx.invoked_subcommand is None: print(ctx.get_help()) diff --git a/cocode/common.py b/cocode/common.py index b6934a9..0caee9a 100644 --- a/cocode/common.py +++ b/cocode/common.py @@ -13,6 +13,8 @@ from cocode.github.github_repo_manager import GitHubRepoManager from cocode.repox.repox_processor import RESULTS_DIR +PIPELINE_LIBRARY_DIRS = ["cocode/pipelines"] + class PipeCode(StrEnum): """Pipeline codes for SWE analysis operations.""" diff --git a/pyproject.toml b/pyproject.toml index 5a978a8..307cd93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,12 +18,12 @@ classifiers = [ ] dependencies = [ - "pipelex[anthropic,google,google-genai,bedrock]==0.15.7", + "pipelex[anthropic,google,google-genai,bedrock]", "PyGithub==2.4.0", ] [tool.uv.sources] -pipelex = { path = "../pipelex", editable = true } +pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "feature/Chicago" } [project.optional-dependencies] docs = [ @@ -83,7 +83,8 @@ required-version = ">=0.7.2" [tool.mypy] packages = ["cocode", "tests"] check_untyped_defs = true -exclude = "^.*\\.venv/.*$" +# exclude = "^.*\\.venv/.*$" +exclude = "^.*\\.venv/.*$|^.*/pipelex/tests/.*$" mypy_path = "." plugins = ["pydantic.mypy"] python_version = "3.11" diff --git a/tests/conftest.py b/tests/conftest.py index a78f630..72647cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,10 @@ from rich.console import Console from rich.traceback import Traceback +from cocode.common import PIPELINE_LIBRARY_DIRS + +PIPELINE_LIBRARY_DIRS_FOR_TESTS = [*PIPELINE_LIBRARY_DIRS, "tests/pipelines"] + pytest_plugins = [ "pipelex.test_extras.shared_pytest_plugins", ] @@ -31,7 +35,7 @@ def reset_pipelex_config_fixture(): # Code to run before each test print("\n[magenta]pipelex setup[/magenta]") try: - make_pipelex_for_cli(context=ErrorContext.VALIDATION) + make_pipelex_for_cli(context=ErrorContext.VALIDATION, library_dirs=PIPELINE_LIBRARY_DIRS) do_validate_all_libraries_and_dry_run() config = get_config() assert isinstance(config, PipelexConfig) diff --git a/tests/integration/test_hello_world.py b/tests/integration/test_hello_world.py index d6dc24a..f96165a 100644 --- a/tests/integration/test_hello_world.py +++ b/tests/integration/test_hello_world.py @@ -3,6 +3,7 @@ from pipelex.pipeline.execute import execute_pipeline +@pytest.mark.xfail(reason="This test is failing because the hello_world pipeline is not found in the pipeline library.") @pytest.mark.asyncio @pytest.mark.inference @pytest.mark.dry_runnable diff --git a/tests/test_pipelines/hello_world.plx b/tests/pipelines/hello_world.plx similarity index 100% rename from tests/test_pipelines/hello_world.plx rename to tests/pipelines/hello_world.plx diff --git a/uv.lock b/uv.lock index fdaac9e..c36d6eb 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, - { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], editable = "../pipelex" }, + { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago" }, { name = "pygithub", specifier = "==2.4.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, @@ -1895,7 +1895,7 @@ wheels = [ [[package]] name = "pipelex" version = "0.17.3" -source = { editable = "../pipelex" } +source = { git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago#88f1cefb1901f0759e91b06a2ba2f580291a29f2" } dependencies = [ { name = "aiofiles" }, { name = "backports-strenum", marker = "python_full_version < '3.11'" }, @@ -1945,70 +1945,6 @@ google-genai = [ { name = "instructor", extra = ["google-genai"] }, ] -[package.metadata] -requires-dist = [ - { name = "aioboto3", marker = "extra == 'bedrock'", specifier = ">=13.4.0" }, - { name = "aiofiles", specifier = ">=23.2.1" }, - { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.60.0" }, - { name = "backports-strenum", marker = "python_full_version < '3.11'", specifier = ">=1.3.0" }, - { name = "boto3", marker = "extra == 'bedrock'", specifier = ">=1.34.131" }, - { name = "boto3-stubs", marker = "extra == 'dev'", specifier = ">=1.35.24" }, - { name = "docling", marker = "extra == 'docling'", specifier = ">=2.64.0" }, - { name = "fal-client", marker = "extra == 'fal'", specifier = ">=0.4.1" }, - { name = "filetype", specifier = ">=1.2.0" }, - { name = "google-auth-oauthlib", marker = "extra == 'google'", specifier = ">=1.2.1" }, - { name = "google-genai", marker = "extra == 'google-genai'" }, - { name = "httpx", specifier = ">=0.23.0,<1.0.0" }, - { name = "huggingface-hub", marker = "extra == 'huggingface'", specifier = ">=0.23,<1.0.0" }, - { name = "instructor", specifier = ">=1.8.3,!=1.11.*,!=1.12.*" }, - { name = "instructor", extras = ["google-genai"], marker = "extra == 'google-genai'" }, - { name = "jinja2", specifier = ">=3.1.4" }, - { name = "json2html", specifier = ">=1.3.0" }, - { name = "kajson", specifier = "==0.3.1" }, - { name = "markdown", specifier = ">=3.6" }, - { name = "mistralai", marker = "extra == 'mistralai'", specifier = "==1.5.2" }, - { name = "mkdocs", marker = "extra == 'docs'", specifier = "==1.6.1" }, - { name = "mkdocs-glightbox", marker = "extra == 'docs'", specifier = "==0.4.0" }, - { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, - { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, - { name = "networkx", specifier = ">=3.4.2" }, - { name = "openai", specifier = ">=1.108.1" }, - { name = "opentelemetry-api" }, - { name = "opentelemetry-exporter-otlp-proto-http" }, - { name = "opentelemetry-sdk" }, - { name = "opentelemetry-semantic-conventions" }, - { name = "pillow", specifier = ">=11.2.1" }, - { name = "polyfactory", specifier = ">=2.21.0" }, - { name = "portkey-ai", specifier = ">=2.1.0" }, - { name = "posthog", specifier = ">=6.7.0" }, - { name = "pydantic", specifier = ">=2.10.6,<3.0.0" }, - { name = "pylint", marker = "extra == 'dev'", specifier = ">=3.3.8" }, - { name = "pypdfium2", specifier = ">=4.30.0,!=4.30.1" }, - { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, - { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" }, - { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.1.1" }, - { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" }, - { name = "pytest-sugar", marker = "extra == 'dev'", specifier = ">=1.0.0" }, - { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" }, - { name = "python-dotenv", specifier = ">=1.0.1" }, - { name = "pyyaml", specifier = ">=6.0.2" }, - { name = "rich", specifier = ">=13.8.1" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.6.8" }, - { name = "shortuuid", specifier = ">=1.0.13" }, - { name = "tomli", specifier = ">=2.3.0" }, - { name = "tomlkit", specifier = ">=0.13.2" }, - { name = "typer", specifier = ">=0.16.0" }, - { name = "types-aioboto3", extras = ["bedrock", "bedrock-runtime"], marker = "extra == 'dev'", specifier = ">=13.4.0" }, - { name = "types-aiofiles", marker = "extra == 'dev'", specifier = ">=24.1.0.20240626" }, - { name = "types-markdown", marker = "extra == 'dev'", specifier = ">=3.6.0.20240316" }, - { name = "types-networkx", marker = "extra == 'dev'", specifier = ">=3.3.0.20241020" }, - { name = "types-pyyaml", marker = "extra == 'dev'", specifier = ">=6.0.12.20250326" }, - { name = "typing-extensions", specifier = ">=4.13.2" }, -] -provides-extras = ["anthropic", "bedrock", "docling", "fal", "google", "google-genai", "huggingface", "mistralai", "docs", "dev"] - [[package]] name = "platformdirs" version = "4.3.8" From 4052a5c316f48b9fc2709eda53cdd26ecaa34c9f Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Mon, 19 Jan 2026 11:57:18 +0100 Subject: [PATCH 5/8] Pipelex dep --- pyproject.toml | 2 +- uv.lock | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 307cd93..55864f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ ] [tool.uv.sources] -pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "feature/Chicago" } +pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "pre-release/v0.18.0b1" } [project.optional-dependencies] docs = [ diff --git a/uv.lock b/uv.lock index c36d6eb..3e62176 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, - { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago" }, + { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b1" }, { name = "pygithub", specifier = "==2.4.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, @@ -1894,8 +1894,8 @@ wheels = [ [[package]] name = "pipelex" -version = "0.17.3" -source = { git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago#88f1cefb1901f0759e91b06a2ba2f580291a29f2" } +version = "0.18.0b1" +source = { git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b1#c056b6a7015c4c7c96656e4615578277629e8a5e" } dependencies = [ { name = "aiofiles" }, { name = "backports-strenum", marker = "python_full_version < '3.11'" }, From 0fba2f0c457109aa58e49d36516fc8737cc763e1 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Mon, 19 Jan 2026 14:32:26 +0100 Subject: [PATCH 6/8] Cleanup imports --- cocode/validation_cli.py | 1 - tests/conftest.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/cocode/validation_cli.py b/cocode/validation_cli.py index 988feb8..e3138b2 100644 --- a/cocode/validation_cli.py +++ b/cocode/validation_cli.py @@ -9,7 +9,6 @@ from pipelex.cli.commands.validate_cmd import do_validate_all_libraries_and_dry_run from pipelex.hub import get_pipes from pipelex.pipe_run.dry_run import dry_run_pipes -from pipelex.pipelex import Pipelex validation_app = typer.Typer( name="validation", diff --git a/tests/conftest.py b/tests/conftest.py index 72647cd..8a69395 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,5 @@ import logging -import pipelex.config -import pipelex.pipelex import pytest from pipelex.cli.cli_factory import make_pipelex_for_cli from pipelex.cli.commands.validate_cmd import do_validate_all_libraries_and_dry_run From dd6ae3322653726657b144bd09b812f3d3fe0c89 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Mon, 19 Jan 2026 15:00:10 +0100 Subject: [PATCH 7/8] CI --disable-inference to run without Pipelex Service agreement + cleaned up agent rules --- BLACKBOX_RULES.md => .blackboxrules | 278 ++++--- .cursor/rules/run_pipelex.mdc | 18 +- .cursor/rules/write_pipelex.mdc | 178 +++- .github/copilot-instructions.md | 1202 ++++---------------------- .windsurfrules.md | 278 ++++--- AGENTS.md | 1202 ++++---------------------- CLAUDE.md | 1203 ++++----------------------- Makefile | 2 +- tests/conftest.py | 28 +- tests/integration/test_basic.py | 2 + 10 files changed, 1052 insertions(+), 3339 deletions(-) rename BLACKBOX_RULES.md => .blackboxrules (84%) diff --git a/BLACKBOX_RULES.md b/.blackboxrules similarity index 84% rename from BLACKBOX_RULES.md rename to .blackboxrules index f7f417f..af4572e 100644 --- a/BLACKBOX_RULES.md +++ b/.blackboxrules @@ -1,10 +1,13 @@ -# Pipelex Rules - +# Pipelex Coding Rules + ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ### Pipeline File Naming @@ -20,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -38,10 +41,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -59,7 +62,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -69,7 +72,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -124,16 +127,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -142,7 +145,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -468,7 +471,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -477,7 +480,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -585,15 +588,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -601,9 +605,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -817,7 +955,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -846,7 +984,11 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. @@ -901,7 +1043,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -940,18 +1082,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -959,7 +1101,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -969,7 +1111,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -981,7 +1123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", @@ -1074,82 +1216,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. **A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/.cursor/rules/run_pipelex.mdc b/.cursor/rules/run_pipelex.mdc index eebaa22..7650051 100644 --- a/.cursor/rules/run_pipelex.mdc +++ b/.cursor/rules/run_pipelex.mdc @@ -55,7 +55,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -94,18 +94,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa # If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -# Here we have a single input and it's a PDF. -# Because PDFContent is a native concept, we can use it directly as a value, +# Here we have a single input and it's a document. +# Because DocumentContent is a native concept, we can use it directly as a value, # the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -113,7 +113,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa # Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -123,7 +123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa # so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -135,7 +135,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", diff --git a/.cursor/rules/write_pipelex.mdc b/.cursor/rules/write_pipelex.mdc index 841da97..93422cc 100644 --- a/.cursor/rules/write_pipelex.mdc +++ b/.cursor/rules/write_pipelex.mdc @@ -8,7 +8,10 @@ globs: # Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ## Pipeline File Naming @@ -24,10 +27,10 @@ A pipeline file has three main sections: ### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. ### Concept Definitions @@ -42,10 +45,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -63,7 +66,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ## Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -73,7 +76,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -128,16 +131,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -146,7 +149,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -472,7 +475,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -481,7 +484,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -589,15 +592,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -### Key Parameters +### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -605,9 +609,143 @@ For more control, you can use a nested `template` section instead of the `templa ### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +#### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +#### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +#### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +#### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ## PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -821,7 +959,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -850,6 +988,10 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 5ef7139..af4572e 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,934 +1,13 @@ -Concatenation -# Coding Standards & Best Practices - -This document outlines the core coding standards, best practices, and quality control procedures for the codebase. - -## Type Hints - -1. **Always Use Type Hints** - - Every function parameter must be typed - - Every function return must be typed - - Use type hints for all variables where type is not obvious - - Use types with Uppercase first letter (Dict[], List[], etc.) - -2. **StrEnum** - - Import from `pipelex.types`: - ```python - from pipelex.types import StrEnum - ``` - -## BaseModel Standards - -- Respect Pydantic v2 standards -- Keep models focused and single-purpose -- Use descriptive field names -- Use type hints for all fields -- Document complex validations -- Use Optional[] for nullable fields -- Use Field(default_factory=...) for mutable defaults - -## Factory Pattern - -- Use Factory Pattern for object creation when dealing with multiple implementations - -## Documentation - -1. **Docstring Format** - ```python - def process_image(image_path: str, size: Tuple[int, int]) -> bytes: - """Process and resize an image. - - Args: - image_path: Path to the source image - size: Tuple of (width, height) for resizing - - Returns: - Processed image as bytes - """ - pass - ``` - -2. **Class Documentation** - ```python - class ImageProcessor: - """Handles image processing operations. - - Provides methods for resizing, converting, and optimizing images. - """ - ``` - -## Error Handling - -1. **Graceful Error Handling** - - Use try/except blocks with specific exceptions - - Convert third-party exceptions to custom ones - ```python - try: - from fal_client import AsyncClient as FalAsyncClient - except ImportError as exc: - raise MissingDependencyError( - "fal-client", "fal", - "The fal-client SDK is required to use FAL models." - ) from exc - ``` - -## Code Quality Checks - -### Linting and Type Checking - -Before finalizing a task, run: -```bash -make fix-unused-imports -make check -``` - -This runs multiple code quality tools: -- Pyright: Static type checking -- Ruff: Fast Python linter -- Mypy: Static type checker - -Always fix any issues reported by these tools before proceeding. - -### Running Tests - -1. **Quick Test Run** (no LLM/image generation): - ```bash - make tp - ``` - Runs tests with markers: `(dry_runnable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)` - -2. **Specific Tests**: - ```bash - make tp TEST=TestClassName - # or - make tp TEST=test_function_name - ``` - Note: Matches names starting with the provided string. - -**Important**: Never run `make ti`, `make test-inference`, `make to`, `make test-ocr`, `make tg`, or `make test-imgg` - these use costly inference. - -## Pipelines - -- All pipeline definitions go in `cocode/pipelex_libraries/pipelines/` -- Always validate pipelines after creation/edit with `make validate`. - Iterate if there are errors. - -## Project Structure - -- **Pipelines**: `cocode/pipelex_libraries/pipelines/` -- **Tests**: `tests/` directory -- **Documentation**: `docs/` directory -# Pipeline Guide - -- Always first write your "plan" in natural langage, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. -- Please use POSIX standard for files. (enmpty lines, no trailing whitespaces, etc.) - -# Pipeline Structure Guide - -## Pipeline File Naming -- Files must be `.plx` for pipelines (Always add an empty line at the end of the file, and do not add trailing whitespaces to PLX files at all) -- Files must be `.py` for structures -- Use descriptive names in `snake_case` - -## Pipeline File Structure -A pipeline file has three main sections: -1. Domain statement -2. Concept definitions -3. Pipe definitions - -### Domain Statement -```plx -domain = "domain_name" -description = "Description of the domain" # Optional -``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. - -### Concept Definitions -```plx -[concept] -ConceptName = "Description of the concept" # Should be the same name as the Structure ClassName you want to output -``` - -Important Rules: -- Use PascalCase for concept names -- Never use plurals (no "Stories", use "Story") -- Avoid adjectives (no "LargeText", use "Text") -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number) -yes -### Pipe Definitions - -## Pipe Base Structure - -```plx -[pipe.your_pipe_name] -type = "PipeLLM" -description = "A description of what your pipe does" -inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } -output = "ConceptName" -``` - -DO NOT WRITE: -```plx -[pipe.your_pipe_name] -type = "pipe_sequence" -``` - -But it should be: - -```plx -[pipe.your_pipe_name] -type = "PipeSequence" -description = "....." -``` - -The pipes will all have at least this base structure. -- `inputs`: Dictionnary of key behing the variable used in the prompts, and the value behing the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if pipeSequence) or of the conditionnal pipes (if pipeCondition). -So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • -variable='['ocr_input']'`` -That means that the pipe validate_expense is missing the input `ocr_input` because one of the subpipe is needing it. - -NEVER WRITE THE INPUTS BY BREAKING THE LINE LIKE THIS: - -```plx -inputs = { - input_1 = "ConceptName1", - input_2 = "ConceptName2" -} -``` - - -- `output`: The name of the concept to output. The `ConceptName` should have the same name as the python class if you want structured output: - -# Structured Models Rules - -## Model Location and Registration - -- Create models for structured generations related to "some_domain" in `pipelex_libraries/pipelines/.py` -- Models must inherit from `StructuredContent` or appropriate content type - -## Model Structure - -Concepts and their structure classes are meant to indicate an idea. -A Concept MUST NEVER be a plural noun and you should never create a SomeConceptList: lists and arrays are implicitly handled by Pipelex according to the context. Just define SomeConcept. - -**IMPORTANT: Never create unnecessary structure classes that only refine native concepts without adding fields.** - -DO NOT create structures like: -```python -class Joke(TextContent): - """A humorous text that makes people laugh.""" - pass -``` - -If a concept only refines a native concept (like Text, Image, etc.) without adding new fields, simply declare it in the .plx file: -```plx -[concept] -Joke = "A humorous text that makes people laugh." -``` -If you simply need to refine another native concept, construct it like this: -```plx -[concept.Landscape] -refines = "Image" -``` - -Only create a Python structure class when you need to add specific fields: - -```python -from datetime import datetime -from typing import List, Optional -from pydantic import Field - -from pipelex.core.stuffs.structured_content import StructuredContent - -# IMPORTANT: THE CLASS MUST BE A SUBCLASS OF StructuredContent -class YourModel(StructuredContent): # Always be a subclass of StructuredContent - # Required fields - field1: str - field2: int - - # Optional fields with defaults - field3: Optional[str] = Field(None, "Description of field3") - field4: List[str] = Field(default_factory=list) - - # Date fields should remove timezone - date_field: Optional[datetime] = None -``` -## Usage - -Structures are meant to indicate what class to use for a particular Concept. In general they use the same name as the concept. - -Structure classes defined within `pipelex_libraries/pipelines/` are automatically loaded into the class_registry when setting up Pipelex, no need to do it manually. - - -## Best Practices for structures - -- Respect Pydantic v2 standards -- Use type hints for all fields -- Use `Field` declaration and write the description - - -## Pipe Controllers and Pipe Operator - -Look at the Pipes we have in order to adapt it. Pipes are organized in two categories: - -1. **Controllers** - For flow control: - - `PipeSequence` - For creating a sequence of multiple steps - - `PipeCondition` - If the next pipe depends of the expression of a stuff in the working memory - - `PipeParallel` - For parallelizing pipes - - `PipeBatch` - For running pipes in Batch over a ListContent - -2. **Operators** - For specific tasks: - - `PipeLLM` - Generate Text and Objects (include Vision LLM) - - `PipeOcr` - OCR Pipe - - `PipeImgGen` - Generate Images - - `PipeFunc` - For running classic python scripts - -# PipeSequence Guide - -## Purpose -PipeSequence executes multiple pipes in a defined order, where each step can use results from previous steps. - -## Basic Structure -```plx -[pipe.your_sequence_name] -type = "PipeSequence" -description = "Description of what this sequence does" -inputs = { input_name = "InputType" } # All the inputs of the sub pipes, except the ones generated by intermediate steps -output = "OutputType" -steps = [ - { pipe = "first_pipe", result = "first_result" }, - { pipe = "second_pipe", result = "second_result" }, - { pipe = "final_pipe", result = "final_result" } -] -``` - -## Key Components - -1. **Steps Array**: List of pipes to execute in sequence - - `pipe`: Name of the pipe to execute - - `result`: Name to assign to the pipe's output that will be in the working memory - -## Using PipeBatch in Steps - -You can use PipeBatch functionality within steps using `batch_over` and `batch_as`: - -```plx -steps = [ - { pipe = "process_items", batch_over = "input_list", batch_as = "current_item", result = "processed_items" - } -] -``` - -1. **batch_over**: Specifies a `ListContent` field to iterate over. Each item in the list will be processed individually and IN PARALLEL by the pipe. - - Must be a `ListContent` type containing the items to process - - Can reference inputs or results from previous steps - -2. **batch_as**: Defines the name that will be used to reference the current item being processed - - This name can be used in the pipe's input mappings - - Makes each item from the batch available as a single element - -The result of a batched step will be a `ListContent` containing the outputs from processing each item. - -# PipeCondition Controller - -The PipeCondition controller allows you to implement conditional logic in your pipeline, choosing which pipe to execute based on an evaluated expression. It supports both direct expressions and expression templates. - -## Usage in PLX Configuration - -### Basic Usage with Direct Expression - -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression = "input_data.category" - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` -or -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression_template = "{{ input_data.category }}" # Jinja2 code - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` - -## Key Parameters - -- `expression`: Direct boolean or string expression (mutually exclusive with expression_template) -- `expression_template`: Jinja2 template for more complex conditional logic (mutually exclusive with expression) -- `pipe_map`: Dictionary mapping expression results to pipe codes : -1 - The key on the left (`small`, `medium`) is the result of `expression` or `expression_template`. -2 - The value on the right (`process_small`, `process_medium`, ..) is the name of the pipce to trigger - -# PipeBatch Controller - -The PipeBatch controller allows you to apply a pipe operation to each element in a list of inputs in parallele. It is created via a PipeSequence. - -## Usage in PLX Configuration - -```plx -[pipe.sequence_with_batch] -type = "PipeSequence" -description = "A Sequence of pipes" -inputs = { input_data = "ConceptName" } -output = "OutputConceptName" -steps = [ - { pipe = "pipe_to_apply", batch_over = "input_list", batch_as = "current_item", result = "batch_results" } -] -``` - -## Key Parameters - -- `pipe`: The pipe operation to apply to each element in the batch -- `batch_over`: The name of the list in the context to iterate over -- `batch_as`: The name to use for the current element in the pipe's context -- `result`: Where to store the results of the batch operation - -# PipeLLM Guide - -## Purpose - -PipeLLM is used to: -1. Generate text or objects with LLMs -2. Process images with Vision LLMs - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.write_story] -type = "PipeLLM" -description = "Write a short story" -output = "Text" -prompt_template = """ -Write a short story about a programmer. -""" -``` - -### Structured Data Extraction -```plx -[pipe.extract_info] -type = "PipeLLM" -description = "Extract information" -inputs = { text = "Text" } -output = "PersonInfo" -prompt_template = """ -Extract person information from this text: -@text -""" -``` - -### System Prompts -Add system-level instructions: -```plx -[pipe.expert_analysis] -type = "PipeLLM" -description = "Expert analysis" -output = "Analysis" -system_prompt = "You are a data analysis expert" -prompt_template = "Analyze this data" -``` - -### Multiple Outputs -Generate multiple results: -```plx -[pipe.generate_ideas] -type = "PipeLLM" -description = "Generate ideas" -output = "Idea" -nb_output = 3 # Generate exactly 3 ideas -# OR -multiple_output = true # Let the LLM decide how many to generate -``` - -### Vision Tasks -Process images with VLMs: -```plx -[pipe.analyze_image] -type = "PipeLLM" -description = "Analyze image" -inputs = { image = "Image" } # `image` is the name of the stuff that contains the Image. If its in a stuff, you can add something like `{ "page.image": "Image" } -output = "ImageAnalysis" -prompt_template = "Describe what you see in this image" -``` - -# PipeOCR Guide - -## Purpose - -Extract text and images from an image or a PDF - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.extract_info] -type = "PipeOcr" -description = "extract the information" -inputs = { ocr_input = "PDF" } # or { ocr_input = "Image" } if its an image. This is the only input -output = "Page" -``` - -The input ALWAYS HAS TO BE `ocr_input` and the value is either of concept `Image` or `Pdf`. - -The output concept `Page` is a native concept, with the structure `PageContent`: -It corresponds to 1 page. Therefore, the PipeOcr is outputing a `ListContent` of `Page` - -```python -class TextAndImagesContent(StuffContent): - text: Optional[TextContent] - images: Optional[List[ImageContent]] - -class PageContent(StructuredContent): # CONCEPT IS "Page" - text_and_images: TextAndImagesContent - page_view: Optional[ImageContent] = None -``` -- `text_and_images` are the text, and the related images found in the input image or PDF. -- `page_view` is the screenshot of the whole pdf page/image. - -This rule explains how to write prompt templates in PipeLLM definitions. - -## Insert stuff inside a tagged block - -If the inserted text is supposedly long text, made of several lines or paragraphs, you want it inserted inside a block, possibly a block tagged and delimlited with proper syntax as one would do in a markdown documentation. To include stuff as a block, use the "@" prefix. - -Example template: -```plx -prompt_template = """ -Match the expense with its corresponding invoice: - -@expense - -@invoices -""" -``` -In this example, the expense data and the invoices data are obviously made of several lines each, that's why it makes sense to use the "@" prefix in order to have them delimited inside a block. Note that our preprocessor will automatically include the block's title, so it doens't need to be explictly written in the prompt template. - -**DO NOT write things like "Here is the expense: @expense".** -**DO write simply "@expense" alone in an isolated line.** - -## Insert stuff inline - -If the inserted text is short text and it makes sense to have it inserted directly into a sentence, you want it inserted inline. To insert stuff inline, use the "$" prefix. This will insert the stuff without delimiters and the content will be rendered as plain text. - -Example template: -```plx -prompt_template = """ -Your goal is to summarize everything related to $topic in the provided text: - -@text - -Please provide only the summary, with no additional text or explanations. -Your summary should not be longer than 2 sentences. -""" -``` - -Here, $topic will be inserted inline, whereas @text will be a a delimited block. -Be sure to make the proper choice of prefix for each insertion. - -**DO NOT write "$topic" alone in an isolated line.** -**DO write things like "Write an essay about $topic" included in an actual sentence.** - -# Example to execute a pipeline - -```python -import asyncio - -from pipelex import pretty_print -from pipelex.hub import get_pipeline_tracker, get_report_delegate -from pipelex.pipelex import Pipelex -from pipelex.pipeline.execute import execute_pipeline - -from cocode.pipelines.examples.extract_gantt.gantt import GanttChart - -SAMPLE_NAME = "extract_gantt" -IMAGE_URL = "assets/gantt/gantt_tree_house.png" - - -async def extract_gantt(image_url: str) -> GanttChart: - # Run the pipe - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - # Output the result - return pipe_output.main_stuff_as(content_type=GanttChart) - - -# start Pipelex -Pipelex.make() - -# run sample using asyncio -gantt_chart = asyncio.run(extract_gantt(IMAGE_URL)) - -# Display cost report (tokens used and cost) -get_report_delegate().generate_report() -# output results -pretty_print(gantt_chart, title="Gantt Chart") -get_pipeline_tracker().output_flowchart() -``` - -The input memory is a dictionary of key-value pairs, where the key is the name of the input variable and the value provides details to make it a stuff object. The relevant definitions are: -```python -StuffContentOrData = Dict[str, Any] | StuffContent | List[Any] | str -ImplicitMemory = Dict[str, StuffContentOrData] -``` -As you can seen, we made it so different ways can be used to define that stuff using structured content or data. - -So here are a few concrete examples of calls to execute_pipeline with various ways to set up the input memory: - -```python -# Here we have a single input and it's a Text. -# If you assign a string, by default it will be considered as a TextContent. - pipe_output = await execute_pipeline( - pipe_code="master_advisory_orchestrator", - inputs={ - "user_input": problem_description, - }, - ) - -# Here we have a single input and it's a PDF. -# Because PDFContent is a native concept, we can use it directly as a value, -# the system knows what content it corresponds to: - pipe_output = await execute_pipeline( - pipe_code="power_extractor_dpe", - inputs={ - "ocr_input": PDFContent(url=pdf_url), - }, - ) - -# Here we have a single input and it's an Image. -# Because ImageContent is a native concept, we can use it directly as a value: - pipe_output = await execute_pipeline( - pipe_code="fashion_variation_pipeline", - inputs={ - "fashion_photo": ImageContent(url=image_url), - }, - ) - -# Here we have a single input, it's an image but -# its actually a more specific concept gantt.GanttImage which refines Image, -# so we must provide it using a dict with the concept and the content: - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - -# Here is a more complex example with multiple inputs assigned using different ways: - pipe_output = await execute_pipeline( - pipe_code="retrieve_then_answer", - dynamic_output_concept_code="contracts.Fees", - inputs={ - "text": load_text_from_path(path=text_path), - "question": { - "concept": "answer.Question", - "content": question, - }, - "client_instructions": client_instructions, - }, - ) -``` - -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. -Then, create an example file to run the pipeline in the `examples` folder. -But don't write documentation unless asked explicitly to. - -# Rules to choose LLM models used in PipeLLMs. - -## LLM Handles - -In order to use it in a pipe, an LLM is referenced by its llm_handle and possibly by an llm_preset. -Both llm_handles and llm_presets are defined in this toml config file: [base_llm_deck.toml](./cocode/pipelex_libraries/llm_deck/base_llm_deck.toml) - -## LLM Handles - -An llm_handle matches the handle (an id of sorts) with the full specification of the LLM to use, i.e.: -- llm_name -- llm_version -- llm_platform_choice - -The declaration of llm_handles looks like this in toml syntax: -```toml -[llm_handles] -gpt-4o-2024-11-20 = { llm_name = "gpt-4o", llm_version = "2024-11-20" } -``` - -In mosty cases, we only want to use version "latest" and llm_platform_choice "default" in which case the declaration is simply a match of the llm_handle to the llm_name, like this: -```toml -best-claude = "claude-4-opus" -best-gemini = "gemini-2.5-pro" -best-mistral = "mistral-large" -``` - -And of course, llm_handles are automatically assigned for all models by their name, with version "latest" and llm_platform_choice "default". - -## Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -llm = { llm_handle = "gpt-4o-mini", temperature = 0.9, max_tokens = "auto" } -prompt_template = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -## LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_to_reason = { llm_handle = "o4-mini", temperature = 1, max_tokens = "auto" } -llm_to_extract_invoice = { llm_handle = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -llm = "llm_to_extract_invoice" -prompt_template = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `llm = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets in [overrides.toml](./cocode/pipelex_libraries/llm_deck/overrides.toml). - -These rules apply when writing unit tests. -- Always use pytest - -## Test file structure - -- Name test files with `test_` prefix -- Use descriptive names that match the functionality being tested -- Place test files in the appropriate test category directory: - - `tests/unit/` - for unit tests that test individual functions/classes in isolation - - `tests/integration/` - for integration tests that test component interactions - - `tests/e2e/` - for end-to-end tests that test complete workflows - - `tests/test_pipelines/` - for test pipeline definitions (PLX files and their structuring python files) -- Fixtures are defined in conftest.py modules at different levels of the hierarchy, their scope is handled by pytest -- Test data is placed inside test_data.py at different levels of the hierarchy, they must be imported with package paths from the root like `tests.pipelex.test_data`. Their content is all constants, regrouped inside classes to keep things tidy. -- Always put test inside Test classes. -- The pipelex pipelines should be stored in `tests/test_pipelines` as well as the related structured Output classes that inherit from `StructuredContent` - -## Markers - -Apply the appropriate markers: -- "llm: uses an LLM to generate text or objects" -- "imgg: uses an image generation AI" -- "inference: uses either an LLM or an image generation AI" -- "gha_disabled: will not be able to run properly on GitHub Actions" - -Several markers may be applied. For instance, if the test uses an LLM, then it uses inference, so you must mark with both `inference`and `llm`. - -## Tips - -- Never use the unittest.mock. Use pytest-mock - -## Test Class Structure - -Always group the tests of a module into a test class: - -```python -@pytest.mark.llm -@pytest.mark.inference -@pytest.mark.asyncio(loop_scope="class") -class TestFooBar: - @pytest.mark.parametrize( - "topic test_case_blueprint", - [ - TestCases.CASE_1, - TestCases.CASE_2, - ], - ) - async def test_pipe_processing( - self, - request: FixtureRequest, - topic: str, - test_case_blueprint: StuffBlueprint, - ): - # Test implementation -``` - -Sometimes it can be convenient to access the test's name in its body, for instance to include into a job_id. To achieve that, add the argument `request: FixtureRequest` into the signature and then you can get the test name using `cast(str, request.node.originalname), # type: ignore`. - -# Pipe tests - -## Required imports for pipe tests - -```python -import pytest -from pytest import FixtureRequest -from pipelex import log, pretty_print -from pipelex.core.stuffs.stuff_factory import StuffBlueprint, StuffFactory -from pipelex.core.memory.working_memory_factory import WorkingMemoryFactory -from pipelex.hub import get_report_delegate -from cocode.pipelines.base_library.retrieve import RetrievedExcerpt -from pipelex.config_pipelex import get_config - -from pipelex.core.pipe import PipeAbstract, update_job_metadata_for_pipe -from pipelex.core.pipes.pipe_output import PipeOutput, PipeOutputType -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.pipe_works.pipe_router_protocol import PipeRouterProtocol -``` - -## Pipe test implementation steps - -1. Create Stuff from blueprint: - -```python -stuff = StuffFactory.make_stuff( - concept_code="RetrievedExcerpt", - domain="retrieve", - content=RetrievedExcerpt(text="", justification="") - name="retrieved_text", -) -``` - -2. Create Working Memory: - -```python -working_memory = WorkingMemoryFactory.make_from_single_stuff(stuff=stuff) -``` - -3. Run the pipe: - -```python -pipe_output: PipeOutput = await pipe_router.run_pipe( - pipe_code="pipe_name", - pipe_run_params=PipeRunParamsFactory.make_run_params(), - working_memory=working_memory, - job_metadata=JobMetadata(), -) -``` - -4. Log output and generate report: - -```python -pretty_print(pipe_output, title=f"Pipe output") -get_report_delegate().generate_report() -``` - -5. Basic assertions: - -```python -assert pipe_output is not None -assert pipe_output.working_memory is not None -assert pipe_output.main_stuff is not None -``` - -## Test Data Organization - -- If it's not already there, create a `test_data.py` file in the test directory -- Define test cases using `StuffBlueprint`: - -```python -class TestCases: - CASE_BLUEPRINT_1 = StuffBlueprint( - name="test_case_1", - concept_code="domain.ConceptName1", - value="test_value" - ) - CASE_BLUEPRINT_2 = StuffBlueprint( - name="test_case_2", - concept_code="domain.ConceptName2", - value="test_value" - ) - - CASE_BLUEPRINTS: ClassVar[List[Tuple[str, str]]] = [ # topic, blueprint" - ("topic1", CASE_BLUEPRINT_1), - ("topic2", CASE_BLUEPRINT_2), - ] -``` - -Note how we avoid initializing a default mutable value within a class instance, instead we use ClassVar. -Also note that we provide a topic for the test case, which is purely for convenience. - -## Best Practices for Testing - -- Use parametrize for multiple test cases -- Test both success and failure cases -- Verify working memory state -- Check output structure and content -- Use meaningful test case names -- Include docstrings explaining test purpose -- Log outputs for debugging -- Generate reports for cost tracking - -# Test-Driven Development Guide - -This document outlines our test-driven development (TDD) process and the tools available for testing. - -## TDD Cycle - -1. **Write a Test First** -[pytest.mdc](pytest.mdc) - -2. **Write the Code** - - Implement the minimum amount of code needed to pass the test - - Follow the project's coding standards - - Keep it simple - don't write more than needed - -3. **Run Linting and Type Checking** -[coding_standards.mdc](coding_standards.mdc) - -4. **Refactor if needed** -If the code needs refactoring, with the best practices [coding_standards.mdc](coding_standards.mdc) - -5. **Validate tests** - -Remember: The key to TDD is writing the test first and letting it drive your implementation. Always run the full test suite and quality checks before considering a feature complete. - +# Pipelex Coding Rules + ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ### Pipeline File Naming @@ -944,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -962,10 +41,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -983,7 +62,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -993,7 +72,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -1048,16 +127,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -1066,7 +145,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -1392,7 +471,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -1401,7 +480,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -1509,15 +588,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -1525,9 +605,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -1741,7 +955,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -1770,7 +984,11 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. @@ -1825,7 +1043,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1864,18 +1082,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -1883,7 +1101,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -1893,7 +1111,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1905,7 +1123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", @@ -1998,82 +1216,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. **A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/.windsurfrules.md b/.windsurfrules.md index f7f417f..af4572e 100644 --- a/.windsurfrules.md +++ b/.windsurfrules.md @@ -1,10 +1,13 @@ -# Pipelex Rules - +# Pipelex Coding Rules + ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ### Pipeline File Naming @@ -20,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -38,10 +41,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -59,7 +62,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -69,7 +72,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -124,16 +127,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -142,7 +145,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -468,7 +471,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -477,7 +480,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -585,15 +588,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -601,9 +605,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -817,7 +955,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -846,7 +984,11 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. @@ -901,7 +1043,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -940,18 +1082,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -959,7 +1101,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -969,7 +1111,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -981,7 +1123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", @@ -1074,82 +1216,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. **A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/AGENTS.md b/AGENTS.md index 5ef7139..af4572e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,934 +1,13 @@ -Concatenation -# Coding Standards & Best Practices - -This document outlines the core coding standards, best practices, and quality control procedures for the codebase. - -## Type Hints - -1. **Always Use Type Hints** - - Every function parameter must be typed - - Every function return must be typed - - Use type hints for all variables where type is not obvious - - Use types with Uppercase first letter (Dict[], List[], etc.) - -2. **StrEnum** - - Import from `pipelex.types`: - ```python - from pipelex.types import StrEnum - ``` - -## BaseModel Standards - -- Respect Pydantic v2 standards -- Keep models focused and single-purpose -- Use descriptive field names -- Use type hints for all fields -- Document complex validations -- Use Optional[] for nullable fields -- Use Field(default_factory=...) for mutable defaults - -## Factory Pattern - -- Use Factory Pattern for object creation when dealing with multiple implementations - -## Documentation - -1. **Docstring Format** - ```python - def process_image(image_path: str, size: Tuple[int, int]) -> bytes: - """Process and resize an image. - - Args: - image_path: Path to the source image - size: Tuple of (width, height) for resizing - - Returns: - Processed image as bytes - """ - pass - ``` - -2. **Class Documentation** - ```python - class ImageProcessor: - """Handles image processing operations. - - Provides methods for resizing, converting, and optimizing images. - """ - ``` - -## Error Handling - -1. **Graceful Error Handling** - - Use try/except blocks with specific exceptions - - Convert third-party exceptions to custom ones - ```python - try: - from fal_client import AsyncClient as FalAsyncClient - except ImportError as exc: - raise MissingDependencyError( - "fal-client", "fal", - "The fal-client SDK is required to use FAL models." - ) from exc - ``` - -## Code Quality Checks - -### Linting and Type Checking - -Before finalizing a task, run: -```bash -make fix-unused-imports -make check -``` - -This runs multiple code quality tools: -- Pyright: Static type checking -- Ruff: Fast Python linter -- Mypy: Static type checker - -Always fix any issues reported by these tools before proceeding. - -### Running Tests - -1. **Quick Test Run** (no LLM/image generation): - ```bash - make tp - ``` - Runs tests with markers: `(dry_runnable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)` - -2. **Specific Tests**: - ```bash - make tp TEST=TestClassName - # or - make tp TEST=test_function_name - ``` - Note: Matches names starting with the provided string. - -**Important**: Never run `make ti`, `make test-inference`, `make to`, `make test-ocr`, `make tg`, or `make test-imgg` - these use costly inference. - -## Pipelines - -- All pipeline definitions go in `cocode/pipelex_libraries/pipelines/` -- Always validate pipelines after creation/edit with `make validate`. - Iterate if there are errors. - -## Project Structure - -- **Pipelines**: `cocode/pipelex_libraries/pipelines/` -- **Tests**: `tests/` directory -- **Documentation**: `docs/` directory -# Pipeline Guide - -- Always first write your "plan" in natural langage, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. -- Please use POSIX standard for files. (enmpty lines, no trailing whitespaces, etc.) - -# Pipeline Structure Guide - -## Pipeline File Naming -- Files must be `.plx` for pipelines (Always add an empty line at the end of the file, and do not add trailing whitespaces to PLX files at all) -- Files must be `.py` for structures -- Use descriptive names in `snake_case` - -## Pipeline File Structure -A pipeline file has three main sections: -1. Domain statement -2. Concept definitions -3. Pipe definitions - -### Domain Statement -```plx -domain = "domain_name" -description = "Description of the domain" # Optional -``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. - -### Concept Definitions -```plx -[concept] -ConceptName = "Description of the concept" # Should be the same name as the Structure ClassName you want to output -``` - -Important Rules: -- Use PascalCase for concept names -- Never use plurals (no "Stories", use "Story") -- Avoid adjectives (no "LargeText", use "Text") -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number) -yes -### Pipe Definitions - -## Pipe Base Structure - -```plx -[pipe.your_pipe_name] -type = "PipeLLM" -description = "A description of what your pipe does" -inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } -output = "ConceptName" -``` - -DO NOT WRITE: -```plx -[pipe.your_pipe_name] -type = "pipe_sequence" -``` - -But it should be: - -```plx -[pipe.your_pipe_name] -type = "PipeSequence" -description = "....." -``` - -The pipes will all have at least this base structure. -- `inputs`: Dictionnary of key behing the variable used in the prompts, and the value behing the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if pipeSequence) or of the conditionnal pipes (if pipeCondition). -So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • -variable='['ocr_input']'`` -That means that the pipe validate_expense is missing the input `ocr_input` because one of the subpipe is needing it. - -NEVER WRITE THE INPUTS BY BREAKING THE LINE LIKE THIS: - -```plx -inputs = { - input_1 = "ConceptName1", - input_2 = "ConceptName2" -} -``` - - -- `output`: The name of the concept to output. The `ConceptName` should have the same name as the python class if you want structured output: - -# Structured Models Rules - -## Model Location and Registration - -- Create models for structured generations related to "some_domain" in `pipelex_libraries/pipelines/.py` -- Models must inherit from `StructuredContent` or appropriate content type - -## Model Structure - -Concepts and their structure classes are meant to indicate an idea. -A Concept MUST NEVER be a plural noun and you should never create a SomeConceptList: lists and arrays are implicitly handled by Pipelex according to the context. Just define SomeConcept. - -**IMPORTANT: Never create unnecessary structure classes that only refine native concepts without adding fields.** - -DO NOT create structures like: -```python -class Joke(TextContent): - """A humorous text that makes people laugh.""" - pass -``` - -If a concept only refines a native concept (like Text, Image, etc.) without adding new fields, simply declare it in the .plx file: -```plx -[concept] -Joke = "A humorous text that makes people laugh." -``` -If you simply need to refine another native concept, construct it like this: -```plx -[concept.Landscape] -refines = "Image" -``` - -Only create a Python structure class when you need to add specific fields: - -```python -from datetime import datetime -from typing import List, Optional -from pydantic import Field - -from pipelex.core.stuffs.structured_content import StructuredContent - -# IMPORTANT: THE CLASS MUST BE A SUBCLASS OF StructuredContent -class YourModel(StructuredContent): # Always be a subclass of StructuredContent - # Required fields - field1: str - field2: int - - # Optional fields with defaults - field3: Optional[str] = Field(None, "Description of field3") - field4: List[str] = Field(default_factory=list) - - # Date fields should remove timezone - date_field: Optional[datetime] = None -``` -## Usage - -Structures are meant to indicate what class to use for a particular Concept. In general they use the same name as the concept. - -Structure classes defined within `pipelex_libraries/pipelines/` are automatically loaded into the class_registry when setting up Pipelex, no need to do it manually. - - -## Best Practices for structures - -- Respect Pydantic v2 standards -- Use type hints for all fields -- Use `Field` declaration and write the description - - -## Pipe Controllers and Pipe Operator - -Look at the Pipes we have in order to adapt it. Pipes are organized in two categories: - -1. **Controllers** - For flow control: - - `PipeSequence` - For creating a sequence of multiple steps - - `PipeCondition` - If the next pipe depends of the expression of a stuff in the working memory - - `PipeParallel` - For parallelizing pipes - - `PipeBatch` - For running pipes in Batch over a ListContent - -2. **Operators** - For specific tasks: - - `PipeLLM` - Generate Text and Objects (include Vision LLM) - - `PipeOcr` - OCR Pipe - - `PipeImgGen` - Generate Images - - `PipeFunc` - For running classic python scripts - -# PipeSequence Guide - -## Purpose -PipeSequence executes multiple pipes in a defined order, where each step can use results from previous steps. - -## Basic Structure -```plx -[pipe.your_sequence_name] -type = "PipeSequence" -description = "Description of what this sequence does" -inputs = { input_name = "InputType" } # All the inputs of the sub pipes, except the ones generated by intermediate steps -output = "OutputType" -steps = [ - { pipe = "first_pipe", result = "first_result" }, - { pipe = "second_pipe", result = "second_result" }, - { pipe = "final_pipe", result = "final_result" } -] -``` - -## Key Components - -1. **Steps Array**: List of pipes to execute in sequence - - `pipe`: Name of the pipe to execute - - `result`: Name to assign to the pipe's output that will be in the working memory - -## Using PipeBatch in Steps - -You can use PipeBatch functionality within steps using `batch_over` and `batch_as`: - -```plx -steps = [ - { pipe = "process_items", batch_over = "input_list", batch_as = "current_item", result = "processed_items" - } -] -``` - -1. **batch_over**: Specifies a `ListContent` field to iterate over. Each item in the list will be processed individually and IN PARALLEL by the pipe. - - Must be a `ListContent` type containing the items to process - - Can reference inputs or results from previous steps - -2. **batch_as**: Defines the name that will be used to reference the current item being processed - - This name can be used in the pipe's input mappings - - Makes each item from the batch available as a single element - -The result of a batched step will be a `ListContent` containing the outputs from processing each item. - -# PipeCondition Controller - -The PipeCondition controller allows you to implement conditional logic in your pipeline, choosing which pipe to execute based on an evaluated expression. It supports both direct expressions and expression templates. - -## Usage in PLX Configuration - -### Basic Usage with Direct Expression - -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression = "input_data.category" - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` -or -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression_template = "{{ input_data.category }}" # Jinja2 code - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` - -## Key Parameters - -- `expression`: Direct boolean or string expression (mutually exclusive with expression_template) -- `expression_template`: Jinja2 template for more complex conditional logic (mutually exclusive with expression) -- `pipe_map`: Dictionary mapping expression results to pipe codes : -1 - The key on the left (`small`, `medium`) is the result of `expression` or `expression_template`. -2 - The value on the right (`process_small`, `process_medium`, ..) is the name of the pipce to trigger - -# PipeBatch Controller - -The PipeBatch controller allows you to apply a pipe operation to each element in a list of inputs in parallele. It is created via a PipeSequence. - -## Usage in PLX Configuration - -```plx -[pipe.sequence_with_batch] -type = "PipeSequence" -description = "A Sequence of pipes" -inputs = { input_data = "ConceptName" } -output = "OutputConceptName" -steps = [ - { pipe = "pipe_to_apply", batch_over = "input_list", batch_as = "current_item", result = "batch_results" } -] -``` - -## Key Parameters - -- `pipe`: The pipe operation to apply to each element in the batch -- `batch_over`: The name of the list in the context to iterate over -- `batch_as`: The name to use for the current element in the pipe's context -- `result`: Where to store the results of the batch operation - -# PipeLLM Guide - -## Purpose - -PipeLLM is used to: -1. Generate text or objects with LLMs -2. Process images with Vision LLMs - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.write_story] -type = "PipeLLM" -description = "Write a short story" -output = "Text" -prompt_template = """ -Write a short story about a programmer. -""" -``` - -### Structured Data Extraction -```plx -[pipe.extract_info] -type = "PipeLLM" -description = "Extract information" -inputs = { text = "Text" } -output = "PersonInfo" -prompt_template = """ -Extract person information from this text: -@text -""" -``` - -### System Prompts -Add system-level instructions: -```plx -[pipe.expert_analysis] -type = "PipeLLM" -description = "Expert analysis" -output = "Analysis" -system_prompt = "You are a data analysis expert" -prompt_template = "Analyze this data" -``` - -### Multiple Outputs -Generate multiple results: -```plx -[pipe.generate_ideas] -type = "PipeLLM" -description = "Generate ideas" -output = "Idea" -nb_output = 3 # Generate exactly 3 ideas -# OR -multiple_output = true # Let the LLM decide how many to generate -``` - -### Vision Tasks -Process images with VLMs: -```plx -[pipe.analyze_image] -type = "PipeLLM" -description = "Analyze image" -inputs = { image = "Image" } # `image` is the name of the stuff that contains the Image. If its in a stuff, you can add something like `{ "page.image": "Image" } -output = "ImageAnalysis" -prompt_template = "Describe what you see in this image" -``` - -# PipeOCR Guide - -## Purpose - -Extract text and images from an image or a PDF - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.extract_info] -type = "PipeOcr" -description = "extract the information" -inputs = { ocr_input = "PDF" } # or { ocr_input = "Image" } if its an image. This is the only input -output = "Page" -``` - -The input ALWAYS HAS TO BE `ocr_input` and the value is either of concept `Image` or `Pdf`. - -The output concept `Page` is a native concept, with the structure `PageContent`: -It corresponds to 1 page. Therefore, the PipeOcr is outputing a `ListContent` of `Page` - -```python -class TextAndImagesContent(StuffContent): - text: Optional[TextContent] - images: Optional[List[ImageContent]] - -class PageContent(StructuredContent): # CONCEPT IS "Page" - text_and_images: TextAndImagesContent - page_view: Optional[ImageContent] = None -``` -- `text_and_images` are the text, and the related images found in the input image or PDF. -- `page_view` is the screenshot of the whole pdf page/image. - -This rule explains how to write prompt templates in PipeLLM definitions. - -## Insert stuff inside a tagged block - -If the inserted text is supposedly long text, made of several lines or paragraphs, you want it inserted inside a block, possibly a block tagged and delimlited with proper syntax as one would do in a markdown documentation. To include stuff as a block, use the "@" prefix. - -Example template: -```plx -prompt_template = """ -Match the expense with its corresponding invoice: - -@expense - -@invoices -""" -``` -In this example, the expense data and the invoices data are obviously made of several lines each, that's why it makes sense to use the "@" prefix in order to have them delimited inside a block. Note that our preprocessor will automatically include the block's title, so it doens't need to be explictly written in the prompt template. - -**DO NOT write things like "Here is the expense: @expense".** -**DO write simply "@expense" alone in an isolated line.** - -## Insert stuff inline - -If the inserted text is short text and it makes sense to have it inserted directly into a sentence, you want it inserted inline. To insert stuff inline, use the "$" prefix. This will insert the stuff without delimiters and the content will be rendered as plain text. - -Example template: -```plx -prompt_template = """ -Your goal is to summarize everything related to $topic in the provided text: - -@text - -Please provide only the summary, with no additional text or explanations. -Your summary should not be longer than 2 sentences. -""" -``` - -Here, $topic will be inserted inline, whereas @text will be a a delimited block. -Be sure to make the proper choice of prefix for each insertion. - -**DO NOT write "$topic" alone in an isolated line.** -**DO write things like "Write an essay about $topic" included in an actual sentence.** - -# Example to execute a pipeline - -```python -import asyncio - -from pipelex import pretty_print -from pipelex.hub import get_pipeline_tracker, get_report_delegate -from pipelex.pipelex import Pipelex -from pipelex.pipeline.execute import execute_pipeline - -from cocode.pipelines.examples.extract_gantt.gantt import GanttChart - -SAMPLE_NAME = "extract_gantt" -IMAGE_URL = "assets/gantt/gantt_tree_house.png" - - -async def extract_gantt(image_url: str) -> GanttChart: - # Run the pipe - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - # Output the result - return pipe_output.main_stuff_as(content_type=GanttChart) - - -# start Pipelex -Pipelex.make() - -# run sample using asyncio -gantt_chart = asyncio.run(extract_gantt(IMAGE_URL)) - -# Display cost report (tokens used and cost) -get_report_delegate().generate_report() -# output results -pretty_print(gantt_chart, title="Gantt Chart") -get_pipeline_tracker().output_flowchart() -``` - -The input memory is a dictionary of key-value pairs, where the key is the name of the input variable and the value provides details to make it a stuff object. The relevant definitions are: -```python -StuffContentOrData = Dict[str, Any] | StuffContent | List[Any] | str -ImplicitMemory = Dict[str, StuffContentOrData] -``` -As you can seen, we made it so different ways can be used to define that stuff using structured content or data. - -So here are a few concrete examples of calls to execute_pipeline with various ways to set up the input memory: - -```python -# Here we have a single input and it's a Text. -# If you assign a string, by default it will be considered as a TextContent. - pipe_output = await execute_pipeline( - pipe_code="master_advisory_orchestrator", - inputs={ - "user_input": problem_description, - }, - ) - -# Here we have a single input and it's a PDF. -# Because PDFContent is a native concept, we can use it directly as a value, -# the system knows what content it corresponds to: - pipe_output = await execute_pipeline( - pipe_code="power_extractor_dpe", - inputs={ - "ocr_input": PDFContent(url=pdf_url), - }, - ) - -# Here we have a single input and it's an Image. -# Because ImageContent is a native concept, we can use it directly as a value: - pipe_output = await execute_pipeline( - pipe_code="fashion_variation_pipeline", - inputs={ - "fashion_photo": ImageContent(url=image_url), - }, - ) - -# Here we have a single input, it's an image but -# its actually a more specific concept gantt.GanttImage which refines Image, -# so we must provide it using a dict with the concept and the content: - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - -# Here is a more complex example with multiple inputs assigned using different ways: - pipe_output = await execute_pipeline( - pipe_code="retrieve_then_answer", - dynamic_output_concept_code="contracts.Fees", - inputs={ - "text": load_text_from_path(path=text_path), - "question": { - "concept": "answer.Question", - "content": question, - }, - "client_instructions": client_instructions, - }, - ) -``` - -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. -Then, create an example file to run the pipeline in the `examples` folder. -But don't write documentation unless asked explicitly to. - -# Rules to choose LLM models used in PipeLLMs. - -## LLM Handles - -In order to use it in a pipe, an LLM is referenced by its llm_handle and possibly by an llm_preset. -Both llm_handles and llm_presets are defined in this toml config file: [base_llm_deck.toml](./cocode/pipelex_libraries/llm_deck/base_llm_deck.toml) - -## LLM Handles - -An llm_handle matches the handle (an id of sorts) with the full specification of the LLM to use, i.e.: -- llm_name -- llm_version -- llm_platform_choice - -The declaration of llm_handles looks like this in toml syntax: -```toml -[llm_handles] -gpt-4o-2024-11-20 = { llm_name = "gpt-4o", llm_version = "2024-11-20" } -``` - -In mosty cases, we only want to use version "latest" and llm_platform_choice "default" in which case the declaration is simply a match of the llm_handle to the llm_name, like this: -```toml -best-claude = "claude-4-opus" -best-gemini = "gemini-2.5-pro" -best-mistral = "mistral-large" -``` - -And of course, llm_handles are automatically assigned for all models by their name, with version "latest" and llm_platform_choice "default". - -## Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -llm = { llm_handle = "gpt-4o-mini", temperature = 0.9, max_tokens = "auto" } -prompt_template = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -## LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_to_reason = { llm_handle = "o4-mini", temperature = 1, max_tokens = "auto" } -llm_to_extract_invoice = { llm_handle = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -llm = "llm_to_extract_invoice" -prompt_template = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `llm = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets in [overrides.toml](./cocode/pipelex_libraries/llm_deck/overrides.toml). - -These rules apply when writing unit tests. -- Always use pytest - -## Test file structure - -- Name test files with `test_` prefix -- Use descriptive names that match the functionality being tested -- Place test files in the appropriate test category directory: - - `tests/unit/` - for unit tests that test individual functions/classes in isolation - - `tests/integration/` - for integration tests that test component interactions - - `tests/e2e/` - for end-to-end tests that test complete workflows - - `tests/test_pipelines/` - for test pipeline definitions (PLX files and their structuring python files) -- Fixtures are defined in conftest.py modules at different levels of the hierarchy, their scope is handled by pytest -- Test data is placed inside test_data.py at different levels of the hierarchy, they must be imported with package paths from the root like `tests.pipelex.test_data`. Their content is all constants, regrouped inside classes to keep things tidy. -- Always put test inside Test classes. -- The pipelex pipelines should be stored in `tests/test_pipelines` as well as the related structured Output classes that inherit from `StructuredContent` - -## Markers - -Apply the appropriate markers: -- "llm: uses an LLM to generate text or objects" -- "imgg: uses an image generation AI" -- "inference: uses either an LLM or an image generation AI" -- "gha_disabled: will not be able to run properly on GitHub Actions" - -Several markers may be applied. For instance, if the test uses an LLM, then it uses inference, so you must mark with both `inference`and `llm`. - -## Tips - -- Never use the unittest.mock. Use pytest-mock - -## Test Class Structure - -Always group the tests of a module into a test class: - -```python -@pytest.mark.llm -@pytest.mark.inference -@pytest.mark.asyncio(loop_scope="class") -class TestFooBar: - @pytest.mark.parametrize( - "topic test_case_blueprint", - [ - TestCases.CASE_1, - TestCases.CASE_2, - ], - ) - async def test_pipe_processing( - self, - request: FixtureRequest, - topic: str, - test_case_blueprint: StuffBlueprint, - ): - # Test implementation -``` - -Sometimes it can be convenient to access the test's name in its body, for instance to include into a job_id. To achieve that, add the argument `request: FixtureRequest` into the signature and then you can get the test name using `cast(str, request.node.originalname), # type: ignore`. - -# Pipe tests - -## Required imports for pipe tests - -```python -import pytest -from pytest import FixtureRequest -from pipelex import log, pretty_print -from pipelex.core.stuffs.stuff_factory import StuffBlueprint, StuffFactory -from pipelex.core.memory.working_memory_factory import WorkingMemoryFactory -from pipelex.hub import get_report_delegate -from cocode.pipelines.base_library.retrieve import RetrievedExcerpt -from pipelex.config_pipelex import get_config - -from pipelex.core.pipe import PipeAbstract, update_job_metadata_for_pipe -from pipelex.core.pipes.pipe_output import PipeOutput, PipeOutputType -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.pipe_works.pipe_router_protocol import PipeRouterProtocol -``` - -## Pipe test implementation steps - -1. Create Stuff from blueprint: - -```python -stuff = StuffFactory.make_stuff( - concept_code="RetrievedExcerpt", - domain="retrieve", - content=RetrievedExcerpt(text="", justification="") - name="retrieved_text", -) -``` - -2. Create Working Memory: - -```python -working_memory = WorkingMemoryFactory.make_from_single_stuff(stuff=stuff) -``` - -3. Run the pipe: - -```python -pipe_output: PipeOutput = await pipe_router.run_pipe( - pipe_code="pipe_name", - pipe_run_params=PipeRunParamsFactory.make_run_params(), - working_memory=working_memory, - job_metadata=JobMetadata(), -) -``` - -4. Log output and generate report: - -```python -pretty_print(pipe_output, title=f"Pipe output") -get_report_delegate().generate_report() -``` - -5. Basic assertions: - -```python -assert pipe_output is not None -assert pipe_output.working_memory is not None -assert pipe_output.main_stuff is not None -``` - -## Test Data Organization - -- If it's not already there, create a `test_data.py` file in the test directory -- Define test cases using `StuffBlueprint`: - -```python -class TestCases: - CASE_BLUEPRINT_1 = StuffBlueprint( - name="test_case_1", - concept_code="domain.ConceptName1", - value="test_value" - ) - CASE_BLUEPRINT_2 = StuffBlueprint( - name="test_case_2", - concept_code="domain.ConceptName2", - value="test_value" - ) - - CASE_BLUEPRINTS: ClassVar[List[Tuple[str, str]]] = [ # topic, blueprint" - ("topic1", CASE_BLUEPRINT_1), - ("topic2", CASE_BLUEPRINT_2), - ] -``` - -Note how we avoid initializing a default mutable value within a class instance, instead we use ClassVar. -Also note that we provide a topic for the test case, which is purely for convenience. - -## Best Practices for Testing - -- Use parametrize for multiple test cases -- Test both success and failure cases -- Verify working memory state -- Check output structure and content -- Use meaningful test case names -- Include docstrings explaining test purpose -- Log outputs for debugging -- Generate reports for cost tracking - -# Test-Driven Development Guide - -This document outlines our test-driven development (TDD) process and the tools available for testing. - -## TDD Cycle - -1. **Write a Test First** -[pytest.mdc](pytest.mdc) - -2. **Write the Code** - - Implement the minimum amount of code needed to pass the test - - Follow the project's coding standards - - Keep it simple - don't write more than needed - -3. **Run Linting and Type Checking** -[coding_standards.mdc](coding_standards.mdc) - -4. **Refactor if needed** -If the code needs refactoring, with the best practices [coding_standards.mdc](coding_standards.mdc) - -5. **Validate tests** - -Remember: The key to TDD is writing the test first and letting it drive your implementation. Always run the full test suite and quality checks before considering a feature complete. - +# Pipelex Coding Rules + ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ### Pipeline File Naming @@ -944,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -962,10 +41,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -983,7 +62,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -993,7 +72,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -1048,16 +127,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -1066,7 +145,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -1392,7 +471,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -1401,7 +480,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -1509,15 +588,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -1525,9 +605,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -1741,7 +955,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -1770,7 +984,11 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. @@ -1825,7 +1043,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1864,18 +1082,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -1883,7 +1101,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -1893,7 +1111,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1905,7 +1123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", @@ -1998,82 +1216,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. **A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/CLAUDE.md b/CLAUDE.md index a9da93c..af4572e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,935 +1,13 @@ -Concatenation -# Coding Standards & Best Practices - -This document outlines the core coding standards, best practices, and quality control procedures for the codebase. - -## Type Hints - -1. **Always Use Type Hints** - - Every function parameter must be typed - - Every function return must be typed - - Use type hints for all variables where type is not obvious - - Use types with Uppercase first letter (Dict[], List[], etc.) - -2. **StrEnum** - - Import from `pipelex.types`: - ```python - from pipelex.types import StrEnum - ``` - -## BaseModel Standards - -- Respect Pydantic v2 standards -- Keep models focused and single-purpose -- Use descriptive field names -- Use type hints for all fields -- Document complex validations -- Use Optional[] for nullable fields -- Use Field(default_factory=...) for mutable defaults - -## Factory Pattern - -- Use Factory Pattern for object creation when dealing with multiple implementations - -## Documentation - -1. **Docstring Format** - ```python - def process_image(image_path: str, size: Tuple[int, int]) -> bytes: - """Process and resize an image. - - Args: - image_path: Path to the source image - size: Tuple of (width, height) for resizing - - Returns: - Processed image as bytes - """ - pass - ``` - -2. **Class Documentation** - ```python - class ImageProcessor: - """Handles image processing operations. - - Provides methods for resizing, converting, and optimizing images. - """ - ``` - -## Error Handling - -1. **Graceful Error Handling** - - Use try/except blocks with specific exceptions - - Convert third-party exceptions to custom ones - ```python - try: - from fal_client import AsyncClient as FalAsyncClient - except ImportError as exc: - raise MissingDependencyError( - "fal-client", "fal", - "The fal-client SDK is required to use FAL models." - ) from exc - ``` - -## Code Quality Checks - -### Linting and Type Checking - -Before finalizing a task, run: -```bash -make fix-unused-imports -make check -``` - -This runs multiple code quality tools: -- Pyright: Static type checking -- Ruff: Fast Python linter -- Mypy: Static type checker - -Always fix any issues reported by these tools before proceeding. - -### Running Tests - -1. **Quick Test Run** (no LLM/image generation): - ```bash - make tp - ``` - Runs tests with markers: `(dry_runnable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)` - -2. **Specific Tests**: - ```bash - make tp TEST=TestClassName - # or - make tp TEST=test_function_name - ``` - Note: Matches names starting with the provided string. - -**Important**: Never run `make ti`, `make test-inference`, `make to`, `make test-ocr`, `make tg`, or `make test-imgg` - these use costly inference. - -## Pipelines - -- All pipeline definitions go in `cocode/pipelex_libraries/pipelines/` -- Always validate pipelines after creation/edit with `make validate`. - Iterate if there are errors. - -## Project Structure - -- **Pipelines**: `cocode/pipelex_libraries/pipelines/` -- **Tests**: `tests/` directory -- **Documentation**: `docs/` directory - -# Pipeline Guide - -- Always first write your "plan" in natural langage, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. -- Please use POSIX standard for files. (enmpty lines, no trailing whitespaces, etc.) - -# Pipeline Structure Guide - -## Pipeline File Naming -- Files must be `.plx` for pipelines (Always add an empty line at the end of the file, and do not add trailing whitespaces to PLX files at all) -- Files must be `.py` for structures -- Use descriptive names in `snake_case` - -## Pipeline File Structure -A pipeline file has three main sections: -1. Domain statement -2. Concept definitions -3. Pipe definitions - -### Domain Statement -```plx -domain = "domain_name" -description = "Description of the domain" # Optional -``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. - -### Concept Definitions -```plx -[concept] -ConceptName = "Description of the concept" # Should be the same name as the Structure ClassName you want to output -``` - -Important Rules: -- Use PascalCase for concept names -- Never use plurals (no "Stories", use "Story") -- Avoid adjectives (no "LargeText", use "Text") -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number) -yes -### Pipe Definitions - -## Pipe Base Structure - -```plx -[pipe.your_pipe_name] -type = "PipeLLM" -description = "A description of what your pipe does" -inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } -output = "ConceptName" -``` - -DO NOT WRITE: -```plx -[pipe.your_pipe_name] -type = "pipe_sequence" -``` - -But it should be: - -```plx -[pipe.your_pipe_name] -type = "PipeSequence" -description = "....." -``` - -The pipes will all have at least this base structure. -- `inputs`: Dictionnary of key behing the variable used in the prompts, and the value behing the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if pipeSequence) or of the conditionnal pipes (if pipeCondition). -So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • -variable='['ocr_input']'`` -That means that the pipe validate_expense is missing the input `ocr_input` because one of the subpipe is needing it. - -NEVER WRITE THE INPUTS BY BREAKING THE LINE LIKE THIS: - -```plx -inputs = { - input_1 = "ConceptName1", - input_2 = "ConceptName2" -} -``` - - -- `output`: The name of the concept to output. The `ConceptName` should have the same name as the python class if you want structured output: - -# Structured Models Rules - -## Model Location and Registration - -- Create models for structured generations related to "some_domain" in `pipelex_libraries/pipelines/.py` -- Models must inherit from `StructuredContent` or appropriate content type - -## Model Structure - -Concepts and their structure classes are meant to indicate an idea. -A Concept MUST NEVER be a plural noun and you should never create a SomeConceptList: lists and arrays are implicitly handled by Pipelex according to the context. Just define SomeConcept. - -**IMPORTANT: Never create unnecessary structure classes that only refine native concepts without adding fields.** - -DO NOT create structures like: -```python -class Joke(TextContent): - """A humorous text that makes people laugh.""" - pass -``` - -If a concept only refines a native concept (like Text, Image, etc.) without adding new fields, simply declare it in the .plx file: -```plx -[concept] -Joke = "A humorous text that makes people laugh." -``` -If you simply need to refine another native concept, construct it like this: -```plx -[concept.Landscape] -refines = "Image" -``` - -Only create a Python structure class when you need to add specific fields: - -```python -from datetime import datetime -from typing import List, Optional -from pydantic import Field - -from pipelex.core.stuffs.structured_content import StructuredContent - -# IMPORTANT: THE CLASS MUST BE A SUBCLASS OF StructuredContent -class YourModel(StructuredContent): # Always be a subclass of StructuredContent - # Required fields - field1: str - field2: int - - # Optional fields with defaults - field3: Optional[str] = Field(None, "Description of field3") - field4: List[str] = Field(default_factory=list) - - # Date fields should remove timezone - date_field: Optional[datetime] = None -``` -## Usage - -Structures are meant to indicate what class to use for a particular Concept. In general they use the same name as the concept. - -Structure classes defined within `pipelex_libraries/pipelines/` are automatically loaded into the class_registry when setting up Pipelex, no need to do it manually. - - -## Best Practices for structures - -- Respect Pydantic v2 standards -- Use type hints for all fields -- Use `Field` declaration and write the description - - -## Pipe Controllers and Pipe Operator - -Look at the Pipes we have in order to adapt it. Pipes are organized in two categories: - -1. **Controllers** - For flow control: - - `PipeSequence` - For creating a sequence of multiple steps - - `PipeCondition` - If the next pipe depends of the expression of a stuff in the working memory - - `PipeParallel` - For parallelizing pipes - - `PipeBatch` - For running pipes in Batch over a ListContent - -2. **Operators** - For specific tasks: - - `PipeLLM` - Generate Text and Objects (include Vision LLM) - - `PipeOcr` - OCR Pipe - - `PipeImgGen` - Generate Images - - `PipeFunc` - For running classic python scripts - -# PipeSequence Guide - -## Purpose -PipeSequence executes multiple pipes in a defined order, where each step can use results from previous steps. - -## Basic Structure -```plx -[pipe.your_sequence_name] -type = "PipeSequence" -description = "Description of what this sequence does" -inputs = { input_name = "InputType" } # All the inputs of the sub pipes, except the ones generated by intermediate steps -output = "OutputType" -steps = [ - { pipe = "first_pipe", result = "first_result" }, - { pipe = "second_pipe", result = "second_result" }, - { pipe = "final_pipe", result = "final_result" } -] -``` - -## Key Components - -1. **Steps Array**: List of pipes to execute in sequence - - `pipe`: Name of the pipe to execute - - `result`: Name to assign to the pipe's output that will be in the working memory - -## Using PipeBatch in Steps - -You can use PipeBatch functionality within steps using `batch_over` and `batch_as`: - -```plx -steps = [ - { pipe = "process_items", batch_over = "input_list", batch_as = "current_item", result = "processed_items" - } -] -``` - -1. **batch_over**: Specifies a `ListContent` field to iterate over. Each item in the list will be processed individually and IN PARALLEL by the pipe. - - Must be a `ListContent` type containing the items to process - - Can reference inputs or results from previous steps - -2. **batch_as**: Defines the name that will be used to reference the current item being processed - - This name can be used in the pipe's input mappings - - Makes each item from the batch available as a single element - -The result of a batched step will be a `ListContent` containing the outputs from processing each item. - -# PipeCondition Controller - -The PipeCondition controller allows you to implement conditional logic in your pipeline, choosing which pipe to execute based on an evaluated expression. It supports both direct expressions and expression templates. - -## Usage in PLX Configuration - -### Basic Usage with Direct Expression - -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression = "input_data.category" - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` -or -```plx -[pipe.conditional_operation] -type = "PipeCondition" -description = "A conditonal pipe to decide wheter..." -inputs = { input_data = "CategoryInput" } -output = "native.Text" -expression_template = "{{ input_data.category }}" # Jinja2 code - -[pipe.conditional_operation.pipe_map] -small = "process_small" -medium = "process_medium" -large = "process_large" -``` - -## Key Parameters - -- `expression`: Direct boolean or string expression (mutually exclusive with expression_template) -- `expression_template`: Jinja2 template for more complex conditional logic (mutually exclusive with expression) -- `pipe_map`: Dictionary mapping expression results to pipe codes : -1 - The key on the left (`small`, `medium`) is the result of `expression` or `expression_template`. -2 - The value on the right (`process_small`, `process_medium`, ..) is the name of the pipce to trigger - -# PipeBatch Controller - -The PipeBatch controller allows you to apply a pipe operation to each element in a list of inputs in parallele. It is created via a PipeSequence. - -## Usage in PLX Configuration - -```plx -[pipe.sequence_with_batch] -type = "PipeSequence" -description = "A Sequence of pipes" -inputs = { input_data = "ConceptName" } -output = "OutputConceptName" -steps = [ - { pipe = "pipe_to_apply", batch_over = "input_list", batch_as = "current_item", result = "batch_results" } -] -``` - -## Key Parameters - -- `pipe`: The pipe operation to apply to each element in the batch -- `batch_over`: The name of the list in the context to iterate over -- `batch_as`: The name to use for the current element in the pipe's context -- `result`: Where to store the results of the batch operation - -# PipeLLM Guide - -## Purpose - -PipeLLM is used to: -1. Generate text or objects with LLMs -2. Process images with Vision LLMs - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.write_story] -type = "PipeLLM" -description = "Write a short story" -output = "Text" -prompt_template = """ -Write a short story about a programmer. -""" -``` - -### Structured Data Extraction -```plx -[pipe.extract_info] -type = "PipeLLM" -description = "Extract information" -inputs = { text = "Text" } -output = "PersonInfo" -prompt_template = """ -Extract person information from this text: -@text -""" -``` - -### System Prompts -Add system-level instructions: -```plx -[pipe.expert_analysis] -type = "PipeLLM" -description = "Expert analysis" -output = "Analysis" -system_prompt = "You are a data analysis expert" -prompt_template = "Analyze this data" -``` - -### Multiple Outputs -Generate multiple results: -```plx -[pipe.generate_ideas] -type = "PipeLLM" -description = "Generate ideas" -output = "Idea" -nb_output = 3 # Generate exactly 3 ideas -# OR -multiple_output = true # Let the LLM decide how many to generate -``` - -### Vision Tasks -Process images with VLMs: -```plx -[pipe.analyze_image] -type = "PipeLLM" -description = "Analyze image" -inputs = { image = "Image" } # `image` is the name of the stuff that contains the Image. If its in a stuff, you can add something like `{ "page.image": "Image" } -output = "ImageAnalysis" -prompt_template = "Describe what you see in this image" -``` - -# PipeOCR Guide - -## Purpose - -Extract text and images from an image or a PDF - -## Basic Usage - -### Simple Text Generation -```plx -[pipe.extract_info] -type = "PipeOcr" -description = "extract the information" -inputs = { ocr_input = "PDF" } # or { ocr_input = "Image" } if its an image. This is the only input -output = "Page" -``` - -The input ALWAYS HAS TO BE `ocr_input` and the value is either of concept `Image` or `Pdf`. - -The output concept `Page` is a native concept, with the structure `PageContent`: -It corresponds to 1 page. Therefore, the PipeOcr is outputing a `ListContent` of `Page` - -```python -class TextAndImagesContent(StuffContent): - text: Optional[TextContent] - images: Optional[List[ImageContent]] - -class PageContent(StructuredContent): # CONCEPT IS "Page" - text_and_images: TextAndImagesContent - page_view: Optional[ImageContent] = None -``` -- `text_and_images` are the text, and the related images found in the input image or PDF. -- `page_view` is the screenshot of the whole pdf page/image. - -This rule explains how to write prompt templates in PipeLLM definitions. - -## Insert stuff inside a tagged block - -If the inserted text is supposedly long text, made of several lines or paragraphs, you want it inserted inside a block, possibly a block tagged and delimlited with proper syntax as one would do in a markdown documentation. To include stuff as a block, use the "@" prefix. - -Example template: -```plx -prompt_template = """ -Match the expense with its corresponding invoice: - -@expense - -@invoices -""" -``` -In this example, the expense data and the invoices data are obviously made of several lines each, that's why it makes sense to use the "@" prefix in order to have them delimited inside a block. Note that our preprocessor will automatically include the block's title, so it doens't need to be explictly written in the prompt template. - -**DO NOT write things like "Here is the expense: @expense".** -**DO write simply "@expense" alone in an isolated line.** - -## Insert stuff inline - -If the inserted text is short text and it makes sense to have it inserted directly into a sentence, you want it inserted inline. To insert stuff inline, use the "$" prefix. This will insert the stuff without delimiters and the content will be rendered as plain text. - -Example template: -```plx -prompt_template = """ -Your goal is to summarize everything related to $topic in the provided text: - -@text - -Please provide only the summary, with no additional text or explanations. -Your summary should not be longer than 2 sentences. -""" -``` - -Here, $topic will be inserted inline, whereas @text will be a a delimited block. -Be sure to make the proper choice of prefix for each insertion. - -**DO NOT write "$topic" alone in an isolated line.** -**DO write things like "Write an essay about $topic" included in an actual sentence.** - -# Example to execute a pipeline - -```python -import asyncio - -from pipelex import pretty_print -from pipelex.hub import get_pipeline_tracker, get_report_delegate -from pipelex.pipelex import Pipelex -from pipelex.pipeline.execute import execute_pipeline - -from cocode.pipelines.examples.extract_gantt.gantt import GanttChart - -SAMPLE_NAME = "extract_gantt" -IMAGE_URL = "assets/gantt/gantt_tree_house.png" - - -async def extract_gantt(image_url: str) -> GanttChart: - # Run the pipe - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - # Output the result - return pipe_output.main_stuff_as(content_type=GanttChart) - - -# start Pipelex -Pipelex.make() - -# run sample using asyncio -gantt_chart = asyncio.run(extract_gantt(IMAGE_URL)) - -# Display cost report (tokens used and cost) -get_report_delegate().generate_report() -# output results -pretty_print(gantt_chart, title="Gantt Chart") -get_pipeline_tracker().output_flowchart() -``` - -The input memory is a dictionary of key-value pairs, where the key is the name of the input variable and the value provides details to make it a stuff object. The relevant definitions are: -```python -StuffContentOrData = Dict[str, Any] | StuffContent | List[Any] | str -ImplicitMemory = Dict[str, StuffContentOrData] -``` -As you can seen, we made it so different ways can be used to define that stuff using structured content or data. - -So here are a few concrete examples of calls to execute_pipeline with various ways to set up the input memory: - -```python -# Here we have a single input and it's a Text. -# If you assign a string, by default it will be considered as a TextContent. - pipe_output = await execute_pipeline( - pipe_code="master_advisory_orchestrator", - inputs={ - "user_input": problem_description, - }, - ) - -# Here we have a single input and it's a PDF. -# Because PDFContent is a native concept, we can use it directly as a value, -# the system knows what content it corresponds to: - pipe_output = await execute_pipeline( - pipe_code="power_extractor_dpe", - inputs={ - "ocr_input": PDFContent(url=pdf_url), - }, - ) - -# Here we have a single input and it's an Image. -# Because ImageContent is a native concept, we can use it directly as a value: - pipe_output = await execute_pipeline( - pipe_code="fashion_variation_pipeline", - inputs={ - "fashion_photo": ImageContent(url=image_url), - }, - ) - -# Here we have a single input, it's an image but -# its actually a more specific concept gantt.GanttImage which refines Image, -# so we must provide it using a dict with the concept and the content: - pipe_output = await execute_pipeline( - pipe_code="extract_gantt_by_steps", - inputs={ - "gantt_chart_image": { - "concept": "gantt.GanttImage", - "content": ImageContent(url=image_url), - } - }, - ) - -# Here is a more complex example with multiple inputs assigned using different ways: - pipe_output = await execute_pipeline( - pipe_code="retrieve_then_answer", - dynamic_output_concept_code="contracts.Fees", - inputs={ - "text": load_text_from_path(path=text_path), - "question": { - "concept": "answer.Question", - "content": question, - }, - "client_instructions": client_instructions, - }, - ) -``` - -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. -Then, create an example file to run the pipeline in the `examples` folder. -But don't write documentation unless asked explicitly to. - -# Rules to choose LLM models used in PipeLLMs. - -## LLM Handles - -In order to use it in a pipe, an LLM is referenced by its llm_handle and possibly by an llm_preset. -Both llm_handles and llm_presets are defined in this toml config file: [base_llm_deck.toml](./cocode/pipelex_libraries/llm_deck/base_llm_deck.toml) - -## LLM Handles - -An llm_handle matches the handle (an id of sorts) with the full specification of the LLM to use, i.e.: -- llm_name -- llm_version -- llm_platform_choice - -The declaration of llm_handles looks like this in toml syntax: -```toml -[llm_handles] -gpt-4o-2024-11-20 = { llm_name = "gpt-4o", llm_version = "2024-11-20" } -``` - -In mosty cases, we only want to use version "latest" and llm_platform_choice "default" in which case the declaration is simply a match of the llm_handle to the llm_name, like this: -```toml -best-claude = "claude-4-opus" -best-gemini = "gemini-2.5-pro" -best-mistral = "mistral-large" -``` - -And of course, llm_handles are automatically assigned for all models by their name, with version "latest" and llm_platform_choice "default". - -## Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -llm = { llm_handle = "gpt-4o-mini", temperature = 0.9, max_tokens = "auto" } -prompt_template = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -## LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_to_reason = { llm_handle = "o4-mini", temperature = 1, max_tokens = "auto" } -llm_to_extract_invoice = { llm_handle = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -llm = "llm_to_extract_invoice" -prompt_template = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `llm = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets in [overrides.toml](./cocode/pipelex_libraries/llm_deck/overrides.toml). - -These rules apply when writing unit tests. -- Always use pytest - -## Test file structure - -- Name test files with `test_` prefix -- Use descriptive names that match the functionality being tested -- Place test files in the appropriate test category directory: - - `tests/unit/` - for unit tests that test individual functions/classes in isolation - - `tests/integration/` - for integration tests that test component interactions - - `tests/e2e/` - for end-to-end tests that test complete workflows - - `tests/test_pipelines/` - for test pipeline definitions (PLX files and their structuring python files) -- Fixtures are defined in conftest.py modules at different levels of the hierarchy, their scope is handled by pytest -- Test data is placed inside test_data.py at different levels of the hierarchy, they must be imported with package paths from the root like `tests.pipelex.test_data`. Their content is all constants, regrouped inside classes to keep things tidy. -- Always put test inside Test classes. -- The pipelex pipelines should be stored in `tests/test_pipelines` as well as the related structured Output classes that inherit from `StructuredContent` - -## Markers - -Apply the appropriate markers: -- "llm: uses an LLM to generate text or objects" -- "imgg: uses an image generation AI" -- "inference: uses either an LLM or an image generation AI" -- "gha_disabled: will not be able to run properly on GitHub Actions" - -Several markers may be applied. For instance, if the test uses an LLM, then it uses inference, so you must mark with both `inference`and `llm`. - -## Tips - -- Never use the unittest.mock. Use pytest-mock - -## Test Class Structure - -Always group the tests of a module into a test class: - -```python -@pytest.mark.llm -@pytest.mark.inference -@pytest.mark.asyncio(loop_scope="class") -class TestFooBar: - @pytest.mark.parametrize( - "topic test_case_blueprint", - [ - TestCases.CASE_1, - TestCases.CASE_2, - ], - ) - async def test_pipe_processing( - self, - request: FixtureRequest, - topic: str, - test_case_blueprint: StuffBlueprint, - ): - # Test implementation -``` - -Sometimes it can be convenient to access the test's name in its body, for instance to include into a job_id. To achieve that, add the argument `request: FixtureRequest` into the signature and then you can get the test name using `cast(str, request.node.originalname), # type: ignore`. - -# Pipe tests - -## Required imports for pipe tests - -```python -import pytest -from pytest import FixtureRequest -from pipelex import log, pretty_print -from pipelex.core.stuffs.stuff_factory import StuffBlueprint, StuffFactory -from pipelex.core.memory.working_memory_factory import WorkingMemoryFactory -from pipelex.hub import get_report_delegate -from cocode.pipelines.base_library.retrieve import RetrievedExcerpt -from pipelex.config_pipelex import get_config - -from pipelex.core.pipe import PipeAbstract, update_job_metadata_for_pipe -from pipelex.core.pipes.pipe_output import PipeOutput, PipeOutputType -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.core.pipes.pipe_run_params import PipeRunParams -from pipelex.pipe_works.pipe_router_protocol import PipeRouterProtocol -``` - -## Pipe test implementation steps - -1. Create Stuff from blueprint: - -```python -stuff = StuffFactory.make_stuff( - concept_code="RetrievedExcerpt", - domain="retrieve", - content=RetrievedExcerpt(text="", justification="") - name="retrieved_text", -) -``` - -2. Create Working Memory: - -```python -working_memory = WorkingMemoryFactory.make_from_single_stuff(stuff=stuff) -``` - -3. Run the pipe: - -```python -pipe_output: PipeOutput = await pipe_router.run_pipe( - pipe_code="pipe_name", - pipe_run_params=PipeRunParamsFactory.make_run_params(), - working_memory=working_memory, - job_metadata=JobMetadata(), -) -``` - -4. Log output and generate report: - -```python -pretty_print(pipe_output, title=f"Pipe output") -get_report_delegate().generate_report() -``` - -5. Basic assertions: - -```python -assert pipe_output is not None -assert pipe_output.working_memory is not None -assert pipe_output.main_stuff is not None -``` - -## Test Data Organization - -- If it's not already there, create a `test_data.py` file in the test directory -- Define test cases using `StuffBlueprint`: - -```python -class TestCases: - CASE_BLUEPRINT_1 = StuffBlueprint( - name="test_case_1", - concept_code="domain.ConceptName1", - value="test_value" - ) - CASE_BLUEPRINT_2 = StuffBlueprint( - name="test_case_2", - concept_code="domain.ConceptName2", - value="test_value" - ) - - CASE_BLUEPRINTS: ClassVar[List[Tuple[str, str]]] = [ # topic, blueprint" - ("topic1", CASE_BLUEPRINT_1), - ("topic2", CASE_BLUEPRINT_2), - ] -``` - -Note how we avoid initializing a default mutable value within a class instance, instead we use ClassVar. -Also note that we provide a topic for the test case, which is purely for convenience. - -## Best Practices for Testing - -- Use parametrize for multiple test cases -- Test both success and failure cases -- Verify working memory state -- Check output structure and content -- Use meaningful test case names -- Include docstrings explaining test purpose -- Log outputs for debugging -- Generate reports for cost tracking - -# Test-Driven Development Guide - -This document outlines our test-driven development (TDD) process and the tools available for testing. - -## TDD Cycle - -1. **Write a Test First** -[pytest.mdc](pytest.mdc) - -2. **Write the Code** - - Implement the minimum amount of code needed to pass the test - - Follow the project's coding standards - - Keep it simple - don't write more than needed - -3. **Run Linting and Type Checking** -[coding_standards.mdc](coding_standards.mdc) - -4. **Refactor if needed** -If the code needs refactoring, with the best practices [coding_standards.mdc](coding_standards.mdc) - -5. **Validate tests** - -Remember: The key to TDD is writing the test first and letting it drive your implementation. Always run the full test suite and quality checks before considering a feature complete. - +# Pipelex Coding Rules + ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. -- You should ALWAYS RUN the terminal command `make validate` when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. +- You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. + - For a specific file: `pipelex validate path_to_file.plx` + - For all pipelines: `pipelex validate all` + - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) ### Pipeline File Naming @@ -945,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -963,10 +41,10 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. **Refining Native Concepts:** To create a concept that specializes a native concept without adding fields: @@ -984,7 +62,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -994,7 +72,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. @@ -1049,16 +127,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` **Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) **Simple syntax** (creates required text field): ```plx @@ -1067,7 +145,7 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } ``` **3. Python StructuredContent Class (For Advanced Features)** @@ -1393,7 +471,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -1402,7 +480,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -1510,15 +588,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) - `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -1526,9 +605,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -1742,7 +955,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -1771,7 +984,11 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen --- -ALWAYS RUN `make validate` when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. +- For a specific bundle/file: `pipelex validate path_to_file.plx` +- For all pipelines: `pipelex validate all` +- Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. + Then, create an example file to run the pipeline in the `examples` folder. But don't write documentation unless asked explicitly to. @@ -1826,7 +1043,7 @@ async def extract_gantt(image_url: str) -> GanttChart: # Run the pipe pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1865,18 +1082,18 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## If you assign a string, by default it will be considered as a TextContent. pipe_output = await execute_pipeline( pipe_code="master_advisory_orchestrator", - input_memory={ + inputs={ "user_input": problem_description, }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", - input_memory={ - "document": PDFContent(url=pdf_url), + inputs={ + "document": DocumentContent(url=pdf_url), }, ) @@ -1884,7 +1101,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## Because ImageContent is a native concept, we can use it directly as a value: pipe_output = await execute_pipeline( pipe_code="fashion_variation_pipeline", - input_memory={ + inputs={ "fashion_photo": ImageContent(url=image_url), }, ) @@ -1894,7 +1111,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa ## so we must provide it using a dict with the concept and the content: pipe_output = await execute_pipeline( pipe_code="extract_gantt_by_steps", - input_memory={ + inputs={ "gantt_chart_image": { "concept": "gantt.GanttImage", "content": ImageContent(url=image_url), @@ -1906,7 +1123,7 @@ So here are a few concrete examples of calls to execute_pipeline with various wa pipe_output = await execute_pipeline( pipe_code="retrieve_then_answer", dynamic_output_concept_code="contracts.Fees", - input_memory={ + inputs={ "text": load_text_from_path(path=text_path), "question": { "concept": "answer.Question", @@ -1999,82 +1216,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. **A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. - -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/Makefile b/Makefile index 14dd9dc..c408b98 100644 --- a/Makefile +++ b/Makefile @@ -221,7 +221,7 @@ codex-tests: env gha-tests: env $(call PRINT_TITLE,"Unit testing for github actions") @echo "• Running unit tests for github actions (excluding inference and gha_disabled)" - $(VENV_PYTEST) --exitfirst --quiet -m "not inference and not gha_disabled" || [ $$? = 5 ] + $(VENV_PYTEST) --disable-inference --exitfirst --quiet -m "not inference and not gha_disabled" || [ $$? = 5 ] run-all-tests: env $(call PRINT_TITLE,"Running all unit tests") diff --git a/tests/conftest.py b/tests/conftest.py index 8a69395..72ba15f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,18 @@ import logging +from pathlib import Path import pytest from pipelex.cli.cli_factory import make_pipelex_for_cli from pipelex.cli.commands.validate_cmd import do_validate_all_libraries_and_dry_run from pipelex.cli.error_handlers import ErrorContext from pipelex.config import get_config +from pipelex.hub import get_library_manager, set_current_library from pipelex.pipelex import Pipelex from pipelex.system.configuration.config_check import check_is_initialized from pipelex.system.configuration.configs import PipelexConfig +from pipelex.system.runtime import IntegrationMode +from pipelex.test_extras.shared_pytest_plugins import is_inference_disabled_in_pipelex +from pytest import FixtureRequest from rich import print from rich.console import Console from rich.traceback import Traceback @@ -29,12 +34,29 @@ def check_pipelex_initialized(): @pytest.fixture(scope="module", autouse=True) -def reset_pipelex_config_fixture(): +def reset_pipelex_config_fixture(request: FixtureRequest): # Code to run before each test print("\n[magenta]pipelex setup[/magenta]") try: - make_pipelex_for_cli(context=ErrorContext.VALIDATION, library_dirs=PIPELINE_LIBRARY_DIRS) - do_validate_all_libraries_and_dry_run() + disable_inference = is_inference_disabled_in_pipelex(request) + if disable_inference: + # When inference is disabled, use Pipelex.make() directly with disable_inference=True + # This skips gateway terms check and uses mock content generator + Pipelex.make( + integration_mode=IntegrationMode.CI, + disable_inference=True, + library_dirs=PIPELINE_LIBRARY_DIRS, + ) + # Load libraries manually (without validation/dry-run that needs model resolution) + library_manager = get_library_manager() + library_id, _ = library_manager.open_library() + set_current_library(library_id=library_id) + library_dirs_paths = [Path(lib_dir) for lib_dir in PIPELINE_LIBRARY_DIRS] + library_manager.load_libraries(library_id=library_id, library_dirs=library_dirs_paths) + else: + # When inference is enabled, use the CLI factory for proper error handling + make_pipelex_for_cli(context=ErrorContext.VALIDATION, library_dirs=PIPELINE_LIBRARY_DIRS) + do_validate_all_libraries_and_dry_run() config = get_config() assert isinstance(config, PipelexConfig) except Exception as exc: diff --git a/tests/integration/test_basic.py b/tests/integration/test_basic.py index 1019e51..58b5d42 100644 --- a/tests/integration/test_basic.py +++ b/tests/integration/test_basic.py @@ -1,5 +1,6 @@ import asyncio +import pytest from pipelex.hub import get_pipes, get_required_concept from pipelex.pipe_run.dry_run import dry_run_pipes @@ -12,6 +13,7 @@ def test_concept_exists(): assert get_required_concept("swe.OnboardingDocumentation") is not None +@pytest.mark.gha_disabled # Requires model resolution which fails without configured backends def test_dry_run_all_pipes(): """Test that dry_run_all_pipes() runs successfully without errors.""" # This should not raise any exceptions From e991f94ebeabf86af27506ae8d2246c898402b59 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Wed, 21 Jan 2026 12:09:16 +0100 Subject: [PATCH 8/8] Use Chicago b2 --- pyproject.toml | 2 +- uv.lock | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 55864f6..c5040bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ ] [tool.uv.sources] -pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "pre-release/v0.18.0b1" } +pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "pre-release/v0.18.0b2" } [project.optional-dependencies] docs = [ diff --git a/uv.lock b/uv.lock index 3e62176..ea129f6 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "mkdocs-material", marker = "extra == 'docs'", specifier = "==9.6.14" }, { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, - { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b1" }, + { name = "pipelex", extras = ["anthropic", "google", "google-genai", "bedrock"], git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b2" }, { name = "pygithub", specifier = "==2.4.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.1" }, @@ -1894,8 +1894,8 @@ wheels = [ [[package]] name = "pipelex" -version = "0.18.0b1" -source = { git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b1#c056b6a7015c4c7c96656e4615578277629e8a5e" } +version = "0.18.0b2" +source = { git = "https://github.com/Pipelex/pipelex.git?branch=pre-release%2Fv0.18.0b2#06b33904bd697791e058c7bc5b43454d96839f46" } dependencies = [ { name = "aiofiles" }, { name = "backports-strenum", marker = "python_full_version < '3.11'" },