From 590f61b895c92e462129ec7db2ca056018a96123 Mon Sep 17 00:00:00 2001 From: auxten Date: Tue, 17 Mar 2026 14:36:26 +0800 Subject: [PATCH 1/2] Add chdb skills: DataStore (pandas API) and SQL Add two new skills for chdb, the in-process ClickHouse engine for Python: - chdb-datastore: Pandas-compatible DataStore API. Drop-in pandas replacement with ClickHouse performance, supporting 16+ data sources and cross-source joins. - chdb-sql: Raw ClickHouse SQL API. Covers chdb.query(), Session, DB-API 2.0, parametrized queries, UDFs, streaming, and all ClickHouse table functions. Each skill includes SKILL.md, API references, runnable examples, metadata.json, README.md, and a verify_install.py script. --- AGENTS.md | 39 +- README.md | 39 +- skills/chdb-datastore/README.md | 39 ++ skills/chdb-datastore/SKILL.md | 146 +++++++ skills/chdb-datastore/examples/examples.md | 365 ++++++++++++++++ skills/chdb-datastore/metadata.json | 10 + .../references/api-reference.md | 284 ++++++++++++ .../chdb-datastore/references/connectors.md | 290 +++++++++++++ .../chdb-datastore/scripts/verify_install.py | 100 +++++ skills/chdb-sql/README.md | 40 ++ skills/chdb-sql/SKILL.md | 112 +++++ skills/chdb-sql/examples/examples.md | 404 ++++++++++++++++++ skills/chdb-sql/metadata.json | 10 + skills/chdb-sql/references/api-reference.md | 247 +++++++++++ skills/chdb-sql/references/sql-functions.md | 215 ++++++++++ skills/chdb-sql/references/table-functions.md | 214 ++++++++++ skills/chdb-sql/scripts/verify_install.py | 89 ++++ 17 files changed, 2629 insertions(+), 14 deletions(-) create mode 100644 skills/chdb-datastore/README.md create mode 100644 skills/chdb-datastore/SKILL.md create mode 100644 skills/chdb-datastore/examples/examples.md create mode 100644 skills/chdb-datastore/metadata.json create mode 100644 skills/chdb-datastore/references/api-reference.md create mode 100644 skills/chdb-datastore/references/connectors.md create mode 100644 skills/chdb-datastore/scripts/verify_install.py create mode 
100644 skills/chdb-sql/README.md create mode 100644 skills/chdb-sql/SKILL.md create mode 100644 skills/chdb-sql/examples/examples.md create mode 100644 skills/chdb-sql/metadata.json create mode 100644 skills/chdb-sql/references/api-reference.md create mode 100644 skills/chdb-sql/references/sql-functions.md create mode 100644 skills/chdb-sql/references/table-functions.md create mode 100644 skills/chdb-sql/scripts/verify_install.py diff --git a/AGENTS.md b/AGENTS.md index 9bc0378..9a8b710 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,22 +4,45 @@ This file provides guidance to AI coding agents (Claude Code, Cursor, Copilot, e ## Repository Overview -A collection of skills for AI agents working with ClickHouse databases. Skills are packaged instructions and guidelines that extend agent capabilities for database design, query optimization, and operational best practices. +A collection of skills for AI agents working with ClickHouse databases and chdb (in-process ClickHouse for Python). Skills are packaged instructions and guidelines that extend agent capabilities for database design, query optimization, operational best practices, and in-process data analytics. 
## Repository Structure ``` agent-skills/ ├── skills/ -│ └── clickhouse-best-practices/ # ClickHouse optimization guidelines -│ ├── SKILL.md # Skill definition (overview) -│ ├── AGENTS.md # Full compiled guide (generated) +│ ├── clickhouse-best-practices/ # ClickHouse optimization guidelines +│ │ ├── SKILL.md # Skill definition (overview) +│ │ ├── AGENTS.md # Full compiled guide (generated) +│ │ ├── metadata.json # Version, organization, abstract +│ │ ├── README.md # Maintainer guide +│ │ └── rules/ # Individual rule files +│ │ ├── _sections.md # Section metadata +│ │ ├── _template.md # Template for new rules +│ │ └── *.md # Rule files (e.g., query-use-prewhere.md) +│ ├── chdb-datastore/ # chdb pandas-compatible DataStore API +│ │ ├── SKILL.md # Skill definition and quick-start +│ │ ├── metadata.json # Version, organization, abstract +│ │ ├── README.md # Maintainer guide +│ │ ├── references/ # API reference docs +│ │ │ ├── api-reference.md # DataStore method signatures +│ │ │ └── connectors.md # All data source connection methods +│ │ ├── examples/ +│ │ │ └── examples.md # Runnable examples +│ │ └── scripts/ +│ │ └── verify_install.py # Environment verification +│ └── chdb-sql/ # chdb SQL API +│ ├── SKILL.md # Skill definition and quick-start │ ├── metadata.json # Version, organization, abstract │ ├── README.md # Maintainer guide -│ └── rules/ # Individual rule files -│ ├── _sections.md # Section metadata -│ ├── _template.md # Template for new rules -│ └── *.md # Rule files (e.g., query-use-prewhere.md) +│ ├── references/ # SQL reference docs +│ │ ├── api-reference.md # query/Session/connect signatures +│ │ ├── table-functions.md # ClickHouse table functions +│ │ └── sql-functions.md # Commonly used SQL functions +│ ├── examples/ +│ │ └── examples.md # Runnable examples +│ └── scripts/ +│ └── verify_install.py # Environment verification ├── packages/ │ └── clickhouse-best-practices-build/ # Build tooling │ ├── package.json # Bun scripts diff --git a/README.md 
b/README.md index 49a1699..07dd6bf 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ClickHouse Agent Skills -The official Agent Skills for [ClickHouse](https://clickhouse.com/). These skills help LLMs and agents to adopt best practices when working with ClickHouse. +The official Agent Skills for [ClickHouse](https://clickhouse.com/). These skills help LLMs and agents to adopt best practices when working with ClickHouse and [chdb](https://clickhouse.com/docs/chdb) (in-process ClickHouse for Python). You can use these skills with open-source ClickHouse and managed ClickHouse Cloud. [Try ClickHouse Cloud with $300 in free credits](https://clickhouse.com/cloud?utm_medium=github&utm_source=github&utm_ref=agent-skills). @@ -14,9 +14,9 @@ The CLI auto-detects installed agents and prompts you to select where to install ## What is this? -Agent Skills are packaged instructions that extend AI coding agents (Claude Code, Cursor, Copilot, etc.) with domain-specific expertise. This repository provides skills for ClickHouse databases—covering schema design, query optimization, and data ingestion patterns. +Agent Skills are packaged instructions that extend AI coding agents (Claude Code, Cursor, Copilot, etc.) with domain-specific expertise. This repository provides skills for ClickHouse databases and chdb — covering schema design, query optimization, data ingestion patterns, and in-process analytics with Python. -When an agent loads these skills, it gains knowledge of ClickHouse best practices and can apply them while helping you design tables, write queries, or troubleshoot performance issues. +When an agent loads these skills, it gains knowledge of ClickHouse best practices and chdb APIs, and can apply them while helping you design tables, write queries, analyze data, or troubleshoot performance issues. Skills follow the open specification at [agentskills.io](https://agentskills.io). 
@@ -46,9 +46,25 @@ Skills follow the open specification at [agentskills.io](https://agentskills.io) **For agents:** The skill activates automatically when you work with ClickHouse—creating tables, writing queries, or designing data pipelines. +### chdb DataStore + +**Pandas-compatible API** for chdb — drop-in pandas replacement backed by ClickHouse. Write `import chdb.datastore as pd` and use the same pandas API, 10-100x faster. Supports 16+ data sources (MySQL, PostgreSQL, S3, MongoDB, Iceberg, Delta Lake, etc.) with cross-source joins. + +**Location:** [`skills/chdb-datastore/`](./skills/chdb-datastore/) + +**For agents:** The skill activates when you analyze data with pandas-style syntax, speed up slow pandas code, query remote databases as DataFrames, or join data across different sources. + +### chdb SQL + +**In-process ClickHouse SQL** for Python — run SQL queries on local files, remote databases, and cloud storage without a server. Covers `chdb.query()`, Session, DB-API 2.0, parametrized queries, UDFs, streaming, and all ClickHouse table functions. + +**Location:** [`skills/chdb-sql/`](./skills/chdb-sql/) + +**For agents:** The skill activates when you write SQL queries against files, use ClickHouse table functions, build stateful analytical pipelines, or use advanced ClickHouse SQL features. 
+ ## Quick Start -After installation, your AI agent will reference these best practices when: +After installation, your AI agent will reference these skills when: - Creating new tables with `CREATE TABLE` - Choosing `ORDER BY` / `PRIMARY KEY` columns @@ -57,11 +73,22 @@ After installation, your AI agent will reference these best practices when: - Writing or tuning JOINs - Designing data ingestion pipelines - Handling updates or deletes +- Analyzing data with pandas-style DataStore API +- Querying files or databases with chdb SQL +- Joining data across different sources (MySQL + S3 + local files) -Example prompt: +Example prompts: > "Create a table for storing user events with fields for user_id, event_type, properties (JSON), and timestamp" -The agent will apply relevant rules like proper column ordering in the primary key, appropriate data types, and partitioning strategy. +The agent will apply relevant ClickHouse best practices rules. + +> "Load this Parquet file and group by country, show top 10 by revenue" + +The agent will use chdb DataStore or SQL to query the file directly. + +> "Join my MySQL customers table with this local orders.parquet file" + +The agent will use chdb's cross-source join capabilities. ## Supported Agents diff --git a/skills/chdb-datastore/README.md b/skills/chdb-datastore/README.md new file mode 100644 index 0000000..a4a2b46 --- /dev/null +++ b/skills/chdb-datastore/README.md @@ -0,0 +1,39 @@ +# chdb DataStore + +Agent skill for using chdb's pandas-compatible DataStore API — a drop-in pandas replacement backed by ClickHouse. 
+ +## Installation + +```bash +npx skills add clickhouse/agent-skills +``` + +## What's Included + +| File | Purpose | +|------|---------| +| `SKILL.md` | Skill definition and quick-start guide | +| `references/api-reference.md` | Full DataStore method signatures | +| `references/connectors.md` | All 16+ data source connection methods | +| `examples/examples.md` | 11 runnable examples with expected output | +| `scripts/verify_install.py` | Environment verification script | + +## Trigger Phrases + +This skill activates when you: +- "Analyze this file with pandas" +- "Speed up my pandas code" +- "Query this MySQL/PostgreSQL/S3 table as a DataFrame" +- "Join data from different sources" +- "Use DataStore to..." +- "Import datastore as pd" + +## Related + +- **chdb-sql** — For raw ClickHouse SQL queries, use the `chdb-sql` skill instead +- **clickhouse-best-practices** — For ClickHouse schema/query optimization + +## Documentation + +- [chdb docs](https://clickhouse.com/docs/chdb) +- [chdb GitHub](https://github.com/chdb-io/chdb) diff --git a/skills/chdb-datastore/SKILL.md b/skills/chdb-datastore/SKILL.md new file mode 100644 index 0000000..0694fe3 --- /dev/null +++ b/skills/chdb-datastore/SKILL.md @@ -0,0 +1,146 @@ +--- +name: chdb-datastore +description: >- + Drop-in pandas replacement with ClickHouse performance. Use + `import chdb.datastore as pd` (or `from datastore import DataStore`) + and write standard pandas code — same API, 10-100x faster on large + datasets. Supports 16+ data sources (MySQL, PostgreSQL, S3, MongoDB, + ClickHouse, Iceberg, Delta Lake, etc.) and 10+ file formats (Parquet, + CSV, JSON, Arrow, ORC, etc.) with cross-source joins. Use this skill + when the user wants to analyze data with pandas-style syntax, speed + up slow pandas code, query remote databases or cloud storage as + DataFrames, or join data across different sources — even if they + don't explicitly mention chdb or DataStore. 
Do NOT use for raw SQL + queries, ClickHouse server administration, or non-Python languages. +license: Apache-2.0 +compatibility: Requires Python 3.9+, macOS or Linux. pip install chdb. +metadata: + author: chdb-io + version: "4.1" + homepage: https://clickhouse.com/docs/chdb +--- + +# chdb DataStore — It's Just Faster Pandas + +## The Key Insight + +```python +# Change this: +import pandas as pd +# To this: +import chdb.datastore as pd +# Everything else stays the same. +``` + +DataStore is a **lazy, ClickHouse-backed pandas replacement**. Your existing pandas code works unchanged — but operations compile to optimized SQL and execute only when results are needed (e.g., `print()`, `len()`, iteration). + +```bash +pip install chdb +``` + +## Decision Tree: Pick the Right Approach + +``` +1. "I have a file/database and want to analyze it with pandas" + → DataStore.from_file() / from_mysql() / from_s3() etc. + → See references/connectors.md + +2. "I need to join data from different sources" + → Create DataStores from each source, use .join() + → See examples/examples.md #3-5 + +3. "My pandas code is too slow" + → import chdb.datastore as pd — change one line, keep the rest + +4. 
"I need raw SQL queries" + → Use the chdb-sql skill instead +``` + +## Connect to Any Data Source — One Pattern + +```python +from datastore import DataStore + +# Local file (auto-detects .parquet, .csv, .json, .arrow, .orc, .avro, .tsv, .xml) +ds = DataStore.from_file("sales.parquet") + +# Database +ds = DataStore.from_mysql(host="db:3306", database="shop", table="orders", user="root", password="pass") + +# Cloud storage +ds = DataStore.from_s3("s3://bucket/data.parquet", nosign=True) + +# URI shorthand — auto-detects source type +ds = DataStore.uri("mysql://root:pass@db:3306/shop/orders") +``` + +All 16+ sources and URI schemes → [connectors.md](references/connectors.md) + +## After Connecting — Full Pandas API + +```python +result = ds[ds["age"] > 25] # filter +result = ds[["name", "city"]] # select columns +result = ds.sort_values("revenue", ascending=False) # sort +result = ds.groupby("dept")["salary"].mean() # groupby +result = ds.assign(margin=lambda x: x["profit"] / x["revenue"]) # computed column +ds["name"].str.upper() # string accessor +ds["date"].dt.year # datetime accessor +result = ds1.join(ds2, on="id") # join +result = ds.head(10) # preview +print(ds.to_sql()) # see generated SQL +``` + +209 DataFrame methods supported. 
Full API → [api-reference.md](references/api-reference.md) + +## Cross-Source Join — The Killer Feature + +```python +from datastore import DataStore + +customers = DataStore.from_mysql(host="db:3306", database="crm", table="customers", user="root", password="pass") +orders = DataStore.from_file("orders.parquet") + +result = (orders + .join(customers, left_on="customer_id", right_on="id") + .groupby("country") + .agg({"amount": "sum", "rating": "mean"}) + .sort_values("sum", ascending=False)) +print(result) +``` + +More join examples → [examples.md](examples/examples.md) + +## Writing Data + +```python +source = DataStore.from_mysql(host="db:3306", database="shop", table="orders", user="root", password="pass") +target = DataStore("file", path="summary.parquet", format="Parquet") + +target.insert_into("category", "total", "count").select_from( + source.groupby("category").select("category", "sum(amount) AS total", "count() AS count") +).execute() +``` + +## Troubleshooting + +| Problem | Fix | +|---------|-----| +| `ImportError: No module named 'chdb'` | `pip install chdb` | +| `ImportError: cannot import 'DataStore'` | Use `from datastore import DataStore` or `from chdb.datastore import DataStore` | +| Database connection timeout | Include port in host: `host="db:3306"` not `host="db"` | +| Join returns empty result | Check key types match (both int or both string); use `.to_sql()` to inspect | +| Unexpected results | Call `ds.to_sql()` to see the generated SQL and debug | +| Environment check | Run `python scripts/verify_install.py` (from skill directory) | + +## References + +- [API Reference](references/api-reference.md) — Full DataStore method signatures +- [Connectors](references/connectors.md) — All 16+ data source connection methods +- [Examples](examples/examples.md) — 10+ runnable examples with expected output +- [Verify Install](scripts/verify_install.py) — Environment verification script +- [Official Docs](https://clickhouse.com/docs/chdb) + +> Note: 
This skill teaches how to *use* chdb DataStore. +> For raw SQL queries, use the `chdb-sql` skill. +> For contributing to chdb source code, see CLAUDE.md in the project root. diff --git a/skills/chdb-datastore/examples/examples.md b/skills/chdb-datastore/examples/examples.md new file mode 100644 index 0000000..4e88cb9 --- /dev/null +++ b/skills/chdb-datastore/examples/examples.md @@ -0,0 +1,365 @@ +# DataStore Examples + +> All examples are self-contained and runnable. +> Expected output is shown in comments. + +## Table of Contents + +1. [Pandas Replacement: One Import Change](#1-pandas-replacement-one-import-change) +2. [Analyze Local Files](#2-analyze-local-files) +3. [Cross-Source Join: MySQL + Parquet](#3-cross-source-join-mysql--parquet) +4. [Cross-Source Join: S3 + PostgreSQL](#4-cross-source-join-s3--postgresql) +5. [Three-Way Join: File + Database + Cloud](#5-three-way-join-file--database--cloud) +6. [Data Lake Formats: Iceberg, Delta, Hudi](#6-data-lake-formats-iceberg-delta-hudi) +7. [URI Shorthand Access](#7-uri-shorthand-access) +8. [Cloud Storage Variants (S3/GCS/Azure/HDFS)](#8-cloud-storage-variants) +9. [Cross-Source Write](#9-cross-source-write) +10. [Explore Remote Schema](#10-explore-remote-schema) +11. [Common Errors & Fixes](#11-common-errors--fixes) + +--- + +## 1. 
Pandas Replacement: One Import Change + +The simplest way to use chdb — change one line, keep everything else: + +```python +# Before (standard pandas): +# import pandas as pd + +# After (chdb-accelerated): +import chdb.datastore as pd + +df = pd.DataStore({"name": ["Alice", "Bob", "Carol", "Dave"], + "dept": ["Eng", "Sales", "Eng", "Sales"], + "salary": [95000, 72000, 110000, 68000]}) + +# Same pandas API — everything works +result = (df[df["salary"] > 70000] + .groupby("dept") + .agg({"salary": ["mean", "count"]}) + .sort_values("mean", ascending=False)) + +print(result) +# Expected output: +# dept mean count +# 0 Eng 102500 2 +# 1 Sales 72000 1 +``` + +**Why it's faster:** Operations compile to ClickHouse SQL and execute as a single optimized query, instead of step-by-step Python evaluation. + +--- + +## 2. Analyze Local Files + +```python +from datastore import DataStore + +# Parquet — pandas-style analysis +ds = DataStore.from_file("sales.parquet") +top_products = (ds[ds['revenue'] > 0] + .groupby('product') + .agg({'revenue': 'sum', 'quantity': 'sum'}) + .sort_values('revenue', ascending=False) + .head(10)) +print(top_products) + +# CSV with filtering +ds = DataStore.from_file("employees.csv") +senior = ds[(ds['years'] > 5) & (ds['dept'] == 'Engineering')] +print(senior[['name', 'title', 'salary']].sort_values('salary', ascending=False)) + +# Glob pattern — query all matching files at once +ds = DataStore.from_file("logs/2024-*.csv") +errors = ds[ds['level'] == 'ERROR'].groupby('module')['message'].count() +print(errors.sort_values(ascending=False)) + +# See the SQL behind any query +print(top_products.to_sql()) +``` + +--- + +## 3. 
Cross-Source Join: MySQL + Parquet + +```python +from datastore import DataStore + +customers = DataStore.from_mysql( + host="db:3306", database="crm", table="customers", + user="reader", password="pass") + +orders = DataStore.from_file("orders.parquet") + +result = (customers + .join(orders, left_on="id", right_on="customer_id", how="inner") + .groupby("country") + .agg({"amount": ["sum", "mean"], "order_id": "count"}) + .sort_values("sum", ascending=False)) + +print(result) +# Expected: country-level order summary with total, average, and count + +print(result.to_sql()) +# Shows the cross-source SQL generated by chdb +``` + +--- + +## 4. Cross-Source Join: S3 + PostgreSQL + +```python +from datastore import DataStore + +events = DataStore.from_s3( + "s3://analytics/events/2024-*.parquet", + access_key_id="AKIA...", secret_access_key="secret...") + +profiles = DataStore.from_postgresql( + host="pg.example.com:5432", database="users", + table="profiles", user="analyst", password="pass") + +result = (events + .join(profiles, left_on="user_id", right_on="id") + .filter(events['event_type'] == 'purchase') + .groupby(["country", "age_group"]) + .agg({"amount": "sum", "event_id": "count"}) + .sort_values("sum", ascending=False)) + +print(result) +# Expected: purchase events aggregated by country and age group +``` + +--- + +## 5. 
Three-Way Join: File + Database + Cloud + +```python +from datastore import DataStore + +products = DataStore.from_file("products.csv") +orders = DataStore.from_mysql( + host="db:3306", database="shop", table="orders", + user="root", password="pass") +reviews = DataStore.from_s3("s3://feedback/reviews.parquet", nosign=True) + +result = (orders + .join(products, left_on="product_id", right_on="id") + .join(reviews, left_on="product_id", right_on="product_id") + .groupby("category") + .agg({"amount": "sum", "rating": "mean", "review_id": "count"}) + .sort_values("sum", ascending=False)) + +print(result) +# Expected: category-level summary combining order amounts, review ratings, and counts +``` + +--- + +## 6. Data Lake Formats: Iceberg, Delta, Hudi + +```python +from datastore import DataStore + +# Apache Iceberg on S3 +ds = DataStore.from_iceberg( + "s3://warehouse/iceberg/events", + access_key_id="KEY", secret_access_key="SECRET") +print(ds.head(10)) + +# Delta Lake +ds = DataStore.from_delta( + "s3://warehouse/delta/transactions", + access_key_id="KEY", secret_access_key="SECRET") +summary = (ds.groupby("category") + .agg({"amount": "sum"}) + .sort_values("sum", ascending=False)) +print(summary) + +# Hudi +ds = DataStore.from_hudi( + "s3://warehouse/hudi/logs", + access_key_id="KEY", secret_access_key="SECRET") +errors = ds[ds['level'] == 'ERROR'] +print(errors.head(20)) +``` + +--- + +## 7. 
URI Shorthand Access + +```python +from datastore import DataStore + +# One-liner for any source +ds = DataStore.uri("sales.parquet") +ds = DataStore.uri("s3://public-data/dataset.parquet?nosign=true") +ds = DataStore.uri("mysql://root:pass@localhost:3306/shop/orders") +ds = DataStore.uri("postgresql://analyst:pass@pg:5432/analytics/events") +ds = DataStore.uri("clickhouse://ch:9440/analytics/hits?user=reader&password=pass") +ds = DataStore.uri("mongodb://user:pass@mongo:27017/logs.app_events") +ds = DataStore.uri("sqlite:///data/local.db?table=users") +ds = DataStore.uri("deltalake:///data/delta/events") + +# After creating from any source, same pandas API +result = (ds[ds['value'] > 100] + .groupby('category') + .sum() + .sort_values('value', ascending=False)) +print(result) +``` + +--- + +## 8. Cloud Storage Variants + +```python +from datastore import DataStore + +# AWS S3 (private) +ds = DataStore.from_s3("s3://my-bucket/data.parquet", + access_key_id="AKIA...", secret_access_key="secret...") + +# AWS S3 (public) +ds = DataStore.from_s3("s3://public-data/dataset.parquet", nosign=True) + +# Google Cloud Storage +ds = DataStore.from_gcs("gs://my-bucket/data.parquet", + hmac_key="KEY", hmac_secret="SECRET") +ds = DataStore.from_gcs("gs://public-bucket/data.parquet", nosign=True) + +# Azure Blob Storage +ds = DataStore.from_azure( + connection_string="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...", + container="data", path="analytics/events.parquet") + +# HDFS +ds = DataStore.from_hdfs("hdfs://namenode:9000/warehouse/events/*.parquet") +``` + +--- + +## 9. 
Cross-Source Write + +```python +from datastore import DataStore + +# Read from MySQL, transform, write to Parquet +source = DataStore.from_mysql( + host="db:3306", database="shop", table="orders", + user="root", password="pass") +target = DataStore("file", path="output/orders_summary.parquet", format="Parquet") + +target.insert_into("category", "total_revenue", "order_count").select_from( + source + .groupby("category") + .select("category", "sum(amount) AS total_revenue", "count() AS order_count") + .filter(source['amount'] > 0) +).execute() + +# Read from S3, filter, write to local file +source = DataStore.from_s3("s3://logs/events.parquet", nosign=True) +target = DataStore("file", path="filtered_events.parquet", format="Parquet") + +target.insert_into("user_id", "event_type", "ts").select_from( + source.select("user_id", "event_type", "ts") + .filter(source['event_type'] == 'error') +).execute() +``` + +--- + +## 10. Explore Remote Schema + +```python +from datastore import DataStore + +# Connect to MySQL and browse the schema +mysql_ds = DataStore.from_mysql( + host="db:3306", database="ecommerce", + user="analyst", password="pass") + +print(mysql_ds.databases()) # list all databases +print(mysql_ds.tables("ecommerce")) # list tables in a database + +# Quick preview of a specific table +orders = DataStore.from_mysql( + host="db:3306", database="ecommerce", + table="orders", user="analyst", password="pass") + +print(orders.columns) # → ['id', 'customer_id', 'amount', ...] +print(orders.dtypes) # → {'id': 'UInt64', 'amount': 'Float64', ...} +print(orders.describe()) # → statistics for numeric columns +print(orders.head(5)) # → first 5 rows +``` + +--- + +## 11. 
Common Errors & Fixes + +### File not found + +```python +from datastore import DataStore + +# Error: file not found +ds = DataStore.from_file("nonexistent.parquet") +# → Exception: FILE_NOT_FOUND + +# Fix: check the path +import os +print(os.path.exists("nonexistent.parquet")) # → False +ds = DataStore.from_file("data/sales.parquet") # use correct path +``` + +### Database host without port + +```python +# Error: connection timeout / refused +ds = DataStore.from_mysql(host="db", database="shop", table="orders", + user="root", password="pass") +# → Connection refused + +# Fix: include port in host string +ds = DataStore.from_mysql(host="db:3306", database="shop", table="orders", + user="root", password="pass") +``` + +### Join key type mismatch + +```python +# Error: join returns empty or wrong results +users = DataStore({"id": [1, 2, 3], "name": ["Alice", "Bob", "Carol"]}) # id is Int +orders = DataStore({"user_id": ["1", "2", "3"], "amount": [100, 200, 300]}) # user_id is String + +result = users.join(orders, left_on="id", right_on="user_id") +print(result) # → empty or incorrect + +# Fix: ensure matching types — use .to_sql() to diagnose +print(result.to_sql()) # reveals the type mismatch in the JOIN condition +# Cast in source data, or use assign() to convert types before joining +``` + +### Debugging with .to_sql() + +```python +from datastore import DataStore + +ds = DataStore.from_file("sales.parquet") +result = (ds[ds['revenue'] > 1000] + .groupby('product') + .agg({'revenue': 'sum'}) + .sort_values('revenue', ascending=False) + .head(10)) + +# See exactly what SQL will execute +print(result.to_sql()) +# Output: +# SELECT "product", sum("revenue") AS "revenue" +# FROM file('sales.parquet', Parquet) +# WHERE "revenue" > 1000 +# GROUP BY "product" +# ORDER BY "revenue" DESC +# LIMIT 10 +``` diff --git a/skills/chdb-datastore/metadata.json b/skills/chdb-datastore/metadata.json new file mode 100644 index 0000000..61fa2d5 --- /dev/null +++ 
b/skills/chdb-datastore/metadata.json @@ -0,0 +1,10 @@ +{ + "version": "4.1.0", + "organization": "ClickHouse Inc", + "date": "March 2026", + "abstract": "Pandas-compatible DataStore API for chdb. Drop-in pandas replacement backed by ClickHouse: same API, 10-100x faster. Supports 16+ data sources (MySQL, PostgreSQL, S3, ClickHouse, MongoDB, Iceberg, Delta Lake, etc.) and 10+ file formats with cross-source joins.", + "references": [ + "https://clickhouse.com/docs/chdb", + "https://github.com/chdb-io/chdb" + ] +} diff --git a/skills/chdb-datastore/references/api-reference.md b/skills/chdb-datastore/references/api-reference.md new file mode 100644 index 0000000..5224667 --- /dev/null +++ b/skills/chdb-datastore/references/api-reference.md @@ -0,0 +1,284 @@ +# DataStore API Reference + +> Complete method signatures for the DataStore class. +> DataStore provides a pandas-compatible API backed by ClickHouse. + +## Table of Contents + +- [Import & Construction](#import--construction) +- [Selection & Filtering](#selection--filtering) +- [Sorting & Limiting](#sorting--limiting) +- [GroupBy & Aggregation](#groupby--aggregation) +- [Joins](#joins) +- [Mutation](#mutation) +- [String Accessor (.str)](#string-accessor-str) +- [DateTime Accessor (.dt)](#datetime-accessor-dt) +- [Inspection & Execution Triggers](#inspection--execution-triggers) +- [Writing Data](#writing-data) +- [Configuration](#configuration) + +--- + +## Import & Construction + +```python +from datastore import DataStore +# or: from chdb.datastore import DataStore +# or: import chdb.datastore as pd (drop-in replacement) +``` + +### Constructor + +```python +DataStore(source=None, table=None, database=":memory:", connection=None, **kwargs) +``` + +| Source type | Usage | +|-------------|-------| +| dict | `DataStore({'col1': [1, 2], 'col2': ['a', 'b']})` | +| pd.DataFrame | `DataStore(df)` | +| str (source type) | `DataStore("file", path="data.parquet")` | +| str (source type) | `DataStore("mysql", 
host="host:3306", database="db", table="t", user="u", password="p")` | + +### Factory Methods + +See [connectors.md](connectors.md) for all factory methods (`from_file`, `from_mysql`, `from_s3`, `uri`, etc.). + +--- + +## Selection & Filtering + +| Expression | Returns | Description | +|------------|---------|-------------| +| `ds['col']` | LazySeries | Single column | +| `ds[['c1', 'c2']]` | DataStore | Multiple columns | +| `ds[condition]` | DataStore | Boolean filter (e.g., `ds[ds['age'] > 25]`) | +| `.select(*fields)` | DataStore | SQL-style SELECT with expressions | +| `.filter(condition)` | DataStore | SQL-style WHERE clause | +| `.where(condition)` | DataStore | Alias for `.filter()` | + +```python +result = ds[ds["age"] > 25] +result = ds[(ds["status"] == "active") & (ds["revenue"] > 1000)] +result = ds[["name", "city", "revenue"]] +result = ds.select("name", "revenue * 1.1 AS adjusted_revenue") +result = ds.filter(ds["country"] == "US") +``` + +--- + +## Sorting & Limiting + +| Method | Description | +|--------|-------------| +| `.sort_values(by, ascending=True)` | Pandas-style sort (by can be str or list) | +| `.sort(*columns, ascending=True)` | SQL-style ORDER BY | +| `.orderby(*columns, ascending=True)` | Alias for `.sort()` | +| `.limit(n)` | LIMIT n rows | +| `.offset(n)` | Skip first n rows | +| `.head(n=5)` | First n rows | +| `.tail(n=5)` | Last n rows | + +```python +result = ds.sort_values("revenue", ascending=False) +result = ds.sort_values(["country", "city"]) +result = ds.head(10) +result = ds.limit(100).offset(50) +``` + +--- + +## GroupBy & Aggregation + +```python +grouped = ds.groupby(*columns) # returns LazyGroupBy +grouped = ds.groupby("dept") +grouped = ds.groupby(["region", "product"]) +``` + +| Method | Description | +|--------|-------------| +| `.agg(func=None, **kwargs)` | Aggregate with named functions | +| `.sum()`, `.mean()`, `.count()`, `.min()`, `.max()` | Single aggregation | +| `.std()`, `.var()` | Standard deviation / 
variance | +| `.having(condition)` | HAVING clause (after aggregation) | + +```python +result = ds.groupby("dept")["salary"].mean() +result = ds.groupby("dept").agg({"salary": "mean", "bonus": "sum"}) +result = ds.groupby(["region", "product"]).agg( + total_revenue=("revenue", "sum"), + avg_quantity=("quantity", "mean")) +``` + +--- + +## Joins + +```python +.join(other, on=None, how='inner', left_on=None, right_on=None, suffixes=('_x', '_y')) +.merge(other, on=None, how='inner') +``` + +| `how` | Description | +|-------|-------------| +| `'inner'` | Only matching rows (default) | +| `'left'` | All left rows + matching right | +| `'right'` | All right rows + matching left | +| `'outer'` | All rows from both sides | +| `'cross'` | Cartesian product | + +```python +result = orders.join(customers, left_on="customer_id", right_on="id") +result = orders.join(customers, on="customer_id", how="left") +result = ds1.merge(ds2, on="key", how="outer") +``` + +**Cross-source joins work transparently** — join a MySQL table with a Parquet file: + +```python +mysql_ds = DataStore.from_mysql(host="db:3306", database="crm", table="users", user="root", password="pass") +parquet_ds = DataStore.from_file("orders.parquet") +result = mysql_ds.join(parquet_ds, left_on="id", right_on="user_id") +``` + +--- + +## Mutation + +| Method | Description | +|--------|-------------| +| `.assign(**kwargs)` | Add computed columns | +| `.with_column(name, expr)` | Add single column | +| `.drop(columns)` | Remove columns (str or list) | +| `.rename(columns={})` | Rename columns via mapping | +| `.fillna(value)` | Fill NaN/NULL values | +| `.dropna(subset=None)` | Drop rows with NaN/NULL | +| `.distinct(subset=None, keep='first')` | Deduplicate rows | + +```python +result = ds.assign( + profit=ds["revenue"] - ds["cost"], + margin=lambda x: x["profit"] / x["revenue"]) +result = ds.drop("temp_column") +result = ds.rename(columns={"old_name": "new_name"}) +result = ds.fillna(0) +result = 
ds.dropna(subset=["email", "phone"]) +result = ds.distinct(subset=["user_id"], keep="first") +``` + +--- + +## String Accessor (.str) + +Access via `ds['column'].str.*`. 56 methods available, including: + +| Method | Description | +|--------|-------------| +| `.str.upper()`, `.str.lower()` | Case conversion | +| `.str.strip()`, `.str.lstrip()`, `.str.rstrip()` | Whitespace trimming | +| `.str.contains(pattern)` | Substring/regex match → boolean | +| `.str.startswith(prefix)`, `.str.endswith(suffix)` | Prefix/suffix check | +| `.str.replace(old, new)` | String replacement | +| `.str.split(sep)` | Split into parts | +| `.str.len()` | String length | +| `.str.slice(start, stop)` | Substring extraction | +| `.str.cat(sep=None)` | Concatenation | +| `.str.extract(pattern)` | Regex group extraction | +| `.str.pad(width)`, `.str.zfill(width)` | Padding | +| `.str.match(pattern)` | Full regex match | + +```python +ds["name"].str.upper() +ds["email"].str.contains("@gmail") +ds["code"].str.slice(0, 3) +``` + +--- + +## DateTime Accessor (.dt) + +Access via `ds['column'].dt.*`. 
42+ methods available, including: + +| Property/Method | Description | +|-----------------|-------------| +| `.dt.year`, `.dt.month`, `.dt.day` | Date components | +| `.dt.hour`, `.dt.minute`, `.dt.second` | Time components | +| `.dt.dayofweek`, `.dt.dayofyear` | Day ordinals | +| `.dt.quarter` | Quarter (1-4) | +| `.dt.date`, `.dt.time` | Date/time part | +| `.dt.strftime(format)` | Format as string | +| `.dt.floor(freq)`, `.dt.ceil(freq)` | Round to frequency | +| `.dt.tz_localize(tz)`, `.dt.tz_convert(tz)` | Timezone handling | +| `.dt.normalize()` | Reset time to midnight | + +```python +ds["order_date"].dt.year +ds["order_date"].dt.month +ds["timestamp"].dt.hour +ds["created_at"].dt.strftime("%Y-%m-%d") +``` + +--- + +## Inspection & Execution Triggers + +These properties/methods **trigger execution** of the lazy query: + +| Property/Method | Returns | Description | +|-----------------|---------|-------------| +| `.columns` | list | Column names | +| `.shape` | (rows, cols) | Dimensions | +| `.dtypes` | dict | Column types | +| `.head(n=5)` | DataStore | First n rows | +| `.tail(n=5)` | DataStore | Last n rows | +| `.describe()` | DataStore | Summary statistics | +| `.info()` | None | Print DataFrame info | +| `print(ds)` | — | Display results | +| `len(ds)` | int | Row count | +| `for row in ds` | — | Iterate rows | +| `.equals(other)` | bool | Compare DataStores | + +These methods **do not trigger execution**: + +| Method | Returns | Description | +|--------|---------|-------------| +| `.to_sql()` | str | View the generated SQL | +| `.explain()` | str | Execution plan | + +```python +print(ds.columns) # → ['name', 'age', 'city'] +print(ds.shape) # → (1000, 3) +print(ds.to_sql()) # → SELECT ... FROM ... WHERE ... 
+print(ds.describe()) # → statistics table +``` + +--- + +## Writing Data + +Use the `insert_into` / `select_from` pattern: + +```python +source = DataStore.from_mysql(host="db:3306", database="shop", table="orders", user="root", password="pass") +target = DataStore("file", path="output.parquet", format="Parquet") + +target.insert_into("col1", "col2").select_from( + source.select("col1", "col2").filter(source['value'] > 100) +).execute() +``` + +--- + +## Configuration + +```python +from datastore import config + +config.use_chdb() # force chDB/SQL backend +config.use_pandas() # force pandas backend +config.prefer_chdb() # prefer chDB when possible, fallback to pandas +config.prefer_pandas() # prefer pandas when possible, fallback to chDB +config.enable_debug() # verbose logging (shows generated SQL) +config.enable_profiling() # performance profiling +``` diff --git a/skills/chdb-datastore/references/connectors.md b/skills/chdb-datastore/references/connectors.md new file mode 100644 index 0000000..def5e5e --- /dev/null +++ b/skills/chdb-datastore/references/connectors.md @@ -0,0 +1,290 @@ +# DataStore Connectors — All Data Sources + +> Quick reference for connecting DataStore to any data source. +> After connecting, all sources share the same pandas API. + +## Table of Contents + +- [Local Files](#local-files) +- [Cloud Storage](#cloud-storage) +- [Databases](#databases) +- [Data Lakes](#data-lakes) +- [URI Shorthand](#uri-shorthand) +- [In-Memory Data](#in-memory-data) + +--- + +## Local Files + +```python +DataStore.from_file(path, format=None, structure=None, compression=None, **kwargs) +``` + +Format is auto-detected by extension: `.parquet`, `.csv`, `.tsv`, `.json`, `.jsonl`, `.arrow`, `.orc`, `.avro`, `.xml`. 
+ +```python +from datastore import DataStore + +ds = DataStore.from_file("sales.parquet") +ds = DataStore.from_file("data.csv") +ds = DataStore.from_file("events.jsonl") +ds = DataStore.from_file("logs/*.csv") # glob pattern +ds = DataStore.from_file("data/2024-*/events.parquet") # nested glob +ds = DataStore.from_file("data.csv.gz") # compressed, auto-detected +ds = DataStore.from_file("data.tsv", format="TabSeparatedWithNames") # explicit format +``` + +**Notes:** +- Glob patterns (`*`, `**`) work for querying multiple files at once +- Compression (`.gz`, `.zst`, `.bz2`, `.xz`, `.lz4`) is auto-detected from extension +- Use `structure` parameter to specify column types: `structure="id UInt64, name String"` + +--- + +## Cloud Storage + +### S3 + +```python +DataStore.from_s3(url, access_key_id=None, secret_access_key=None, format=None, nosign=False, **kwargs) +``` + +```python +# Public bucket (no auth) +ds = DataStore.from_s3("s3://public-data/dataset.parquet", nosign=True) + +# Private bucket +ds = DataStore.from_s3("s3://my-bucket/data.parquet", + access_key_id="AKIA...", secret_access_key="secret...") + +# Glob pattern +ds = DataStore.from_s3("s3://bucket/logs/2024-*.parquet", nosign=True) +``` + +### GCS (Google Cloud Storage) + +```python +DataStore.from_gcs(url, hmac_key=None, hmac_secret=None, format=None, nosign=False, **kwargs) +``` + +```python +ds = DataStore.from_gcs("gs://my-bucket/data.parquet", nosign=True) +ds = DataStore.from_gcs("gs://private/data.parquet", hmac_key="KEY", hmac_secret="SECRET") +``` + +### Azure Blob Storage + +```python +DataStore.from_azure(connection_string, container, path="", format=None, **kwargs) +``` + +```python +ds = DataStore.from_azure( + connection_string="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...", + container="data", path="analytics/events.parquet") +``` + +### HDFS + +```python +DataStore.from_hdfs(uri, format=None, structure=None, **kwargs) +``` + +```python +ds = 
DataStore.from_hdfs("hdfs://namenode:9000/warehouse/events/*.parquet") +``` + +### HTTP/HTTPS URL + +```python +DataStore.from_url(url, format=None, structure=None, headers=None, **kwargs) +``` + +```python +ds = DataStore.from_url("https://example.com/data.csv") +``` + +--- + +## Databases + +### MySQL + +```python +DataStore.from_mysql(host, database=None, table=None, user=None, password="", port=None, **kwargs) +``` + +```python +ds = DataStore.from_mysql( + host="db.example.com:3306", database="shop", + table="orders", user="root", password="pass") +``` + +**Note:** Port must be included in `host` string (e.g., `"db:3306"`) or passed via `port` parameter. + +### PostgreSQL + +```python +DataStore.from_postgresql(host, database=None, table=None, user=None, password="", port=None, **kwargs) +``` + +```python +ds = DataStore.from_postgresql( + host="pg:5432", database="analytics", + table="events", user="user", password="pass") +``` + +### ClickHouse (Remote) + +```python +DataStore.from_clickhouse(host, database=None, table=None, user="default", password="", secure=False, port=None, **kwargs) +``` + +```python +ds = DataStore.from_clickhouse(host="ch:9000", database="logs", table="access_log") +ds = DataStore.from_clickhouse(host="ch:9440", database="logs", table="hits", + user="reader", password="pass", secure=True) +``` + +### MongoDB + +```python +DataStore.from_mongodb(host, database, collection, user, password="", **kwargs) +``` + +```python +ds = DataStore.from_mongodb( + host="mongo:27017", database="app", + collection="users", user="user", password="pass") +``` + +### SQLite + +```python +DataStore.from_sqlite(database_path, table, **kwargs) +``` + +```python +ds = DataStore.from_sqlite("/data/local.db", "users") +``` + +### Redis + +```python +DataStore.from_redis(host, key, structure, password=None, db_index=0, **kwargs) +``` + +```python +ds = DataStore.from_redis("localhost:6379", key="mydata", + structure="id UInt64, name String, value Float64") +``` 
+ +--- + +## Data Lakes + +### Apache Iceberg + +```python +DataStore.from_iceberg(url, access_key_id=None, secret_access_key=None, **kwargs) +``` + +```python +ds = DataStore.from_iceberg("s3://warehouse/iceberg/events", + access_key_id="KEY", secret_access_key="SECRET") +``` + +### Delta Lake + +```python +DataStore.from_delta(url, access_key_id=None, secret_access_key=None, **kwargs) +``` + +```python +ds = DataStore.from_delta("s3://warehouse/delta/transactions", + access_key_id="KEY", secret_access_key="SECRET") +``` + +### Apache Hudi + +```python +DataStore.from_hudi(url, access_key_id=None, secret_access_key=None, **kwargs) +``` + +```python +ds = DataStore.from_hudi("s3://warehouse/hudi/logs", + access_key_id="KEY", secret_access_key="SECRET") +``` + +--- + +## URI Shorthand + +```python +DataStore.uri(uri_string, **kwargs) +``` + +Universal one-liner that auto-detects source type from the URI scheme: + +| Scheme | Example | +|--------|---------| +| _(path)_ | `sales.parquet`, `/data/file.csv` | +| `file` | `file:///data/file.csv` | +| `s3`, `s3a`, `s3n` | `s3://bucket/key?nosign=true` | +| `gs`, `gcs` | `gs://bucket/path` | +| `az`, `azure`, `wasb` | `az://container/blob?account_name=X&account_key=Y` | +| `hdfs` | `hdfs://namenode:9000/path` | +| `http`, `https` | `https://example.com/data.json` | +| `mysql` | `mysql://user:pass@host:port/db/table` | +| `postgresql`, `postgres` | `postgresql://user:pass@host:port/db/table` | +| `clickhouse` | `clickhouse://host:port/db/table?user=X&password=Y` | +| `mongodb`, `mongo` | `mongodb://user:pass@host:port/db.collection` | +| `sqlite` | `sqlite:///path/to/db.db?table=name` | +| `redis` | `redis://host:port/db?key=mykey&password=pass` | +| `iceberg` | `iceberg://catalog/namespace/table` | +| `deltalake`, `delta` | `deltalake:///path/to/table` | +| `hudi` | `hudi:///path/to/table` | + +```python +from datastore import DataStore + +ds = DataStore.uri("s3://public-data/dataset.parquet?nosign=true") +ds = 
DataStore.uri("mysql://root:pass@localhost:3306/shop/orders") +ds = DataStore.uri("postgresql://analyst:pass@pg:5432/analytics/events") +ds = DataStore.uri("clickhouse://ch:9440/analytics/hits?user=reader&password=pass") +ds = DataStore.uri("mongodb://user:pass@mongo:27017/logs.app_events") +ds = DataStore.uri("sqlite:///data/local.db?table=users") +ds = DataStore.uri("deltalake:///data/delta/events") +``` + +--- + +## In-Memory Data + +### From dict + +```python +ds = DataStore({"name": ["Alice", "Bob"], "age": [25, 30]}) +``` + +### From pandas DataFrame + +```python +ds = DataStore(df) +ds = DataStore.from_df(df, name="my_data") +``` + +### Generated sequences + +```python +ds = DataStore.from_numbers(100) # 0..99 +ds = DataStore.from_numbers(10, start=5, step=2) # 5, 7, 9, ... +``` + +### Random data (for testing) + +```python +ds = DataStore.from_random( + structure="id UInt64, name String, value Float64", + random_seed=42, max_string_length=10) +``` diff --git a/skills/chdb-datastore/scripts/verify_install.py b/skills/chdb-datastore/scripts/verify_install.py new file mode 100644 index 0000000..260dd55 --- /dev/null +++ b/skills/chdb-datastore/scripts/verify_install.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Verify chdb DataStore installation and basic functionality.""" + +import sys + +PASS = "OK" +FAIL = "FAIL" +results = [] + + +def check(name, fn): + try: + fn() + results.append((name, PASS, "")) + print(f" [{PASS}] {name}") + except Exception as e: + results.append((name, FAIL, str(e))) + print(f" [{FAIL}] {name}: {e}") + + +def check_python_version(): + assert sys.version_info >= (3, 9), f"Python 3.9+ required, got {sys.version}" + + +def check_chdb_import(): + import chdb + assert hasattr(chdb, "__version__"), "chdb imported but missing __version__" + print(f" chdb version: {chdb.__version__}") + + +def check_datastore_import_from_datastore(): + from datastore import DataStore + assert DataStore is not None + + +def 
check_datastore_import_from_chdb(): + from chdb.datastore import DataStore + assert DataStore is not None + + +def check_datastore_as_pd(): + import chdb.datastore as pd + assert hasattr(pd, "DataStore") + + +def check_basic_operations(): + from datastore import DataStore + + ds = DataStore({"name": ["Alice", "Bob", "Carol"], "age": [25, 30, 35]}) + filtered = ds[ds["age"] > 25] + assert len(filtered) == 2, f"Expected 2 rows, got {len(filtered)}" + + +def check_sort(): + from datastore import DataStore + + ds = DataStore({"name": ["Charlie", "Alice", "Bob"], "value": [3, 1, 2]}) + sorted_ds = ds.sort_values("value") + cols = sorted_ds.columns + assert "name" in cols and "value" in cols, f"Unexpected columns: {cols}" + + +def check_groupby(): + from datastore import DataStore + + ds = DataStore({ + "dept": ["Eng", "Sales", "Eng", "Sales"], + "salary": [100, 80, 120, 90], + }) + result = ds.groupby("dept")["salary"].mean() + assert len(result) == 2, f"Expected 2 groups, got {len(result)}" + + +if __name__ == "__main__": + print("chdb DataStore Installation Verification") + print("=" * 45) + + check("Python version >= 3.9", check_python_version) + check("import chdb", check_chdb_import) + check("from datastore import DataStore", check_datastore_import_from_datastore) + check("from chdb.datastore import DataStore", check_datastore_import_from_chdb) + check("import chdb.datastore as pd", check_datastore_as_pd) + check("Basic filter operation", check_basic_operations) + check("Sort operation", check_sort) + check("GroupBy aggregation", check_groupby) + + print() + print("=" * 45) + passed = sum(1 for _, s, _ in results if s == PASS) + total = len(results) + print(f"Results: {passed}/{total} passed") + + if passed < total: + print("\nFailed checks:") + for name, status, err in results: + if status == FAIL: + print(f" - {name}: {err}") + sys.exit(1) + else: + print("All checks passed!") diff --git a/skills/chdb-sql/README.md b/skills/chdb-sql/README.md new file mode 100644 
index 0000000..2d10b96 --- /dev/null +++ b/skills/chdb-sql/README.md @@ -0,0 +1,40 @@ +# chdb SQL + +Agent skill for using chdb's SQL API — run ClickHouse SQL directly in Python without a server. + +## Installation + +```bash +npx skills add clickhouse/agent-skills +``` + +## What's Included + +| File | Purpose | +|------|---------| +| `SKILL.md` | Skill definition with quick-start examples | +| `references/api-reference.md` | chdb.query(), Session, Connection signatures | +| `references/table-functions.md` | All ClickHouse table functions (file, s3, mysql, etc.) | +| `references/sql-functions.md` | Commonly used ClickHouse SQL functions | +| `examples/examples.md` | 9 runnable examples with expected output | +| `scripts/verify_install.py` | Environment verification script | + +## Trigger Phrases + +This skill activates when you: +- "Query this Parquet/CSV file with SQL" +- "Use chdb to run a query" +- "Join MySQL and S3 data with SQL" +- "Create a ClickHouse session" +- "Use ClickHouse table functions" +- "Write a parametrized query" + +## Related + +- **chdb-datastore** — For pandas-style DataFrame operations, use the `chdb-datastore` skill instead +- **clickhouse-best-practices** — For ClickHouse schema/query optimization + +## Documentation + +- [chdb docs](https://clickhouse.com/docs/chdb) +- [chdb GitHub](https://github.com/chdb-io/chdb) diff --git a/skills/chdb-sql/SKILL.md b/skills/chdb-sql/SKILL.md new file mode 100644 index 0000000..7aca9dd --- /dev/null +++ b/skills/chdb-sql/SKILL.md @@ -0,0 +1,112 @@ +--- +name: chdb-sql +description: >- + In-process ClickHouse SQL engine for Python — run ClickHouse SQL queries + directly on local files, remote databases, and cloud storage without a + server. 
Use when the user wants to write SQL queries against Parquet/CSV/ + JSON files, use ClickHouse table functions (mysql(), s3(), postgresql(), + iceberg(), deltaLake() etc.), build stateful analytical pipelines with + Session, use parametrized queries, window functions, or other advanced + ClickHouse SQL features. Also use when the user explicitly mentions + chdb.query(), ClickHouse SQL syntax, or wants cross-source SQL joins. + Do NOT use for pandas-style DataFrame operations — use chdb-datastore + instead. +license: Apache-2.0 +compatibility: Requires Python 3.9+, macOS or Linux. pip install chdb. +metadata: + author: chdb-io + version: "4.1" + homepage: https://clickhouse.com/docs/chdb +--- + +# chdb SQL — ClickHouse in Your Python Process + +Run ClickHouse SQL directly in Python — no server needed. Query local files, remote databases, and cloud storage with full ClickHouse SQL power. + +```bash +pip install chdb +``` + +## Decision Tree: Pick the Right API + +``` +1. One-off query on files or databases → chdb.query() +2. Multi-step analysis with tables → Session +3. DB-API 2.0 connection → chdb.connect() +4. 
Pandas-style DataFrame operations → Use chdb-datastore skill instead +``` + +## chdb.query() — One Line, Any Data + +```python +import chdb + +chdb.query("SELECT * FROM file('data.parquet', Parquet) WHERE price > 100 LIMIT 10") # local files +chdb.query("SELECT * FROM mysql('db:3306', 'shop', 'orders', 'root', 'pass')") # databases +chdb.query("SELECT * FROM s3('s3://bucket/data.parquet', NOSIGN) LIMIT 10") # cloud storage +chdb.query("SELECT * FROM deltaLake('s3://bucket/delta/table', NOSIGN) LIMIT 10") # data lakes + +# Cross-source join +chdb.query(""" + SELECT u.name, o.amount FROM mysql('db:3306', 'crm', 'users', 'root', 'pass') AS u + JOIN file('orders.parquet', Parquet) AS o ON u.id = o.user_id ORDER BY o.amount DESC +""") + +data = {"name": ["Alice", "Bob"], "score": [95, 87]} +chdb.query("SELECT * FROM Python(data) ORDER BY score DESC") # Python data +df = chdb.query("SELECT * FROM numbers(10)", "DataFrame") # output formats +chdb.query("SELECT toDate({d:String}) + number FROM numbers({n:UInt64})", + "DataFrame", params={"d": "2025-01-01", "n": 30}) # parametrized +``` + +Table functions → [table-functions.md](references/table-functions.md) | SQL functions → [sql-functions.md](references/sql-functions.md) | Full API → [api-reference.md](references/api-reference.md) + +## Session — Stateful Analysis Pipelines + +```python +from chdb import session as chs +sess = chs.Session("./analytics_db") # persistent; Session() for in-memory + +sess.query("CREATE TABLE users ENGINE=MergeTree() ORDER BY id AS SELECT * FROM mysql('db:3306','crm','users','root','pass')") +sess.query("CREATE TABLE events ENGINE=MergeTree() ORDER BY (ts,user_id) AS SELECT * FROM s3('s3://logs/events/*.parquet',NOSIGN)") +sess.query(""" + SELECT u.country, count() AS cnt, uniqExact(e.user_id) AS users + FROM events e JOIN users u ON e.user_id = u.id + WHERE e.ts >= today() - 7 GROUP BY u.country ORDER BY cnt DESC +""", "Pretty").show() +sess.close() +``` + +## Connection API (DB-API 2.0) + 
+```python +from chdb import dbapi +conn = dbapi.connect() +cur = conn.cursor() +cur.execute("SELECT * FROM file('data.parquet', Parquet) WHERE value > 100") +print(cur.fetchall()) +cur.close() +conn.close() +``` + +## Troubleshooting + +| Problem | Fix | +|---------|-----| +| `ImportError: No module named 'chdb'` | `pip install chdb` | +| `DB::Exception: FILE_NOT_FOUND` | Check file path; use absolute path or verify cwd | +| `DB::Exception: Unknown table function` | Check function name spelling (e.g., `deltaLake` not `deltalake`) | +| Connection refused to remote DB | Check host:port format; ensure remote DB allows connections | +| Environment check | Run `python scripts/verify_install.py` (from skill directory) | + +## References + +- [API Reference](references/api-reference.md) — query/Session/connect signatures +- [Table Functions](references/table-functions.md) — All ClickHouse table functions +- [SQL Functions](references/sql-functions.md) — Commonly used SQL functions +- [Examples](examples/examples.md) — 9 runnable examples with expected output +- [Official Docs](https://clickhouse.com/docs/chdb) + +> Note: This skill teaches how to *use* chdb SQL. +> For pandas-style operations, use the `chdb-datastore` skill. +> For contributing to chdb source code, see CLAUDE.md in the project root. diff --git a/skills/chdb-sql/examples/examples.md b/skills/chdb-sql/examples/examples.md new file mode 100644 index 0000000..fff4f7a --- /dev/null +++ b/skills/chdb-sql/examples/examples.md @@ -0,0 +1,404 @@ +# chdb SQL Examples + +> All examples are self-contained and runnable. +> Expected output is shown in comments. + +## Table of Contents + +1. [Query Any File](#1-query-any-file) +2. [Cross-Source SQL Joins](#2-cross-source-sql-joins) +3. [Session: Build Analytical Tables](#3-session-build-analytical-tables) +4. [Python Data as SQL Table](#4-python-data-as-sql-table) +5. [Parametrized Queries](#5-parametrized-queries) +6. [Window Functions](#6-window-functions) +7. 
[User-Defined Functions (UDF)](#7-user-defined-functions-udf) +8. [Streaming Large Results](#8-streaming-large-results) +9. [Common Errors & Fixes](#9-common-errors--fixes) + +--- + +## 1. Query Any File + +```python +import chdb + +# Parquet +result = chdb.query(""" + SELECT country, count() AS cnt + FROM file('users.parquet', Parquet) + GROUP BY country + ORDER BY cnt DESC + LIMIT 10 +""", "Pretty") +result.show() +# Expected: top 10 countries by user count, formatted table + +# CSV +df = chdb.query(""" + SELECT * FROM file('sales.csv', CSVWithNames) + WHERE revenue > 10000 + ORDER BY revenue DESC +""", "DataFrame") +print(df) +# Expected: pandas DataFrame with high-revenue rows + +# JSON Lines +chdb.query(""" + SELECT * FROM file('events.jsonl', JSONEachRow) + WHERE event_type = 'purchase' +""").show() + +# Glob pattern — query all matching files +df = chdb.query(""" + SELECT level, count() AS cnt + FROM file('logs/2024-*.parquet', Parquet) + GROUP BY level + ORDER BY cnt DESC +""", "DataFrame") +print(df) +# Expected: +# level cnt +# 0 INFO 45230 +# 1 WARN 3210 +# 2 ERROR 890 +``` + +--- + +## 2. 
Cross-Source SQL Joins + +```python +import chdb + +# MySQL + Parquet join +chdb.query(""" + SELECT u.name, u.email, o.product, o.amount + FROM mysql('db:3306', 'crm', 'users', 'root', 'pass') AS u + JOIN file('orders.parquet', Parquet) AS o ON u.id = o.user_id + WHERE o.amount > 100 + ORDER BY o.amount DESC + LIMIT 20 +""", "Pretty").show() + +# S3 + PostgreSQL join +df = chdb.query(""" + SELECT e.event_type, p.country, count() AS cnt + FROM s3('s3://bucket/events.parquet', 'KEY', 'SECRET', 'Parquet') AS e + JOIN postgresql('pg:5432', 'users', 'profiles', 'user', 'pass') AS p + ON e.user_id = p.id + GROUP BY e.event_type, p.country + ORDER BY cnt DESC +""", "DataFrame") +print(df) + +# ClickHouse + local CSV +chdb.query(""" + SELECT r.host, l.status_code, count() AS requests + FROM remote('ch:9000', 'logs', 'access_log', 'default', '') AS r + JOIN file('server_config.csv', CSVWithNames) AS l ON r.host = l.hostname + GROUP BY r.host, l.status_code + ORDER BY requests DESC +""").show() +``` + +--- + +## 3. 
Session: Build Analytical Tables

```python
from chdb import session as chs

sess = chs.Session("./analytics_db")

# Ingest from multiple external sources into local tables
sess.query("""
    CREATE TABLE users ENGINE = MergeTree() ORDER BY id AS
    SELECT * FROM mysql('db:3306', 'crm', 'users', 'root', 'pass')
""")

sess.query("""
    CREATE TABLE events ENGINE = MergeTree() ORDER BY (ts, user_id) AS
    SELECT * FROM s3('s3://logs/events/*.parquet', NOSIGN)
""")

# Analyze locally — fast iterative queries
result = sess.query("""
    SELECT
        u.country,
        e.event_type,
        count() AS cnt,
        uniqExact(e.user_id) AS unique_users
    FROM events e
    JOIN users u ON e.user_id = u.id
    WHERE e.ts >= today() - 7
    GROUP BY u.country, e.event_type
    ORDER BY cnt DESC
    LIMIT 20
""", "Pretty")
result.show()
# Expected: formatted table with country, event_type, count, unique users

# Check table contents
sess.query("SELECT count() FROM users").show()
sess.query("SELECT count() FROM events").show()

sess.close()
```

---

## 4. Python Data as SQL Table

```python
import chdb
import pandas as pd

# Query a Python dict directly in SQL
scores = {"student": ["Alice", "Bob", "Carol"], "math": [95, 87, 92], "science": [88, 91, 85]}
chdb.query("SELECT student, math + science AS total FROM Python(scores) ORDER BY total DESC").show()
# Expected:
# Alice,183
# Bob,178
# Carol,177

# Query a pandas DataFrame in SQL
users_df = pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Carol"]})
chdb.query("""
    SELECT p.name, o.product, o.amount
    FROM Python(users_df) AS p
    JOIN file('orders.parquet', Parquet) AS o ON p.id = o.user_id
    ORDER BY o.amount DESC
""").show()

# Use Python data for parametrized lookups
allowed_ids = {"id": [1, 3, 5, 7, 9]}
df = chdb.query("""
    SELECT * FROM file('data.parquet', Parquet)
    WHERE id IN (SELECT id FROM Python(allowed_ids))
""", "DataFrame")
print(df)
```

---

## 5. 
Parametrized Queries + +```python +import chdb + +# Date range generation +result = chdb.query( + """ + SELECT + toDate({start:String}) + number AS date, + rand() % 1000 AS value + FROM numbers({days:UInt64}) + """, + "DataFrame", + params={"start": "2025-01-01", "days": 30}) +print(result) +# Expected: DataFrame with 30 rows, date column from 2025-01-01 to 2025-01-30 + +# Filtering with parameters +result = chdb.query( + """ + SELECT * FROM file('events.parquet', Parquet) + WHERE event_type = {event:String} + AND created_at >= {since:String} + ORDER BY created_at DESC + LIMIT {limit:UInt64} + """, + "DataFrame", + params={"event": "purchase", "since": "2025-01-01", "limit": 100}) +print(result) +``` + +--- + +## 6. Window Functions + +```python +import chdb + +# Ranking within groups +chdb.query(""" + SELECT + department, + name, + salary, + rank() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank, + salary - avg(salary) OVER (PARTITION BY department) AS diff_from_avg + FROM file('employees.parquet', Parquet) + ORDER BY department, dept_rank +""", "Pretty").show() +# Expected: employees ranked within each department + +# Running totals and moving averages +df = chdb.query(""" + SELECT + date, + revenue, + sum(revenue) OVER (ORDER BY date) AS cumulative_revenue, + avg(revenue) OVER ( + ORDER BY date + ROWS BETWEEN 6 PRECEDING AND CURRENT ROW + ) AS rolling_7d_avg + FROM file('daily_sales.csv', CSVWithNames) + ORDER BY date +""", "DataFrame") +print(df) +# Expected: daily sales with cumulative and 7-day rolling average + +# Top-N per group +df = chdb.query(""" + SELECT * FROM ( + SELECT + category, + product, + sales, + row_number() OVER (PARTITION BY category ORDER BY sales DESC) AS rn + FROM file('products.parquet', Parquet) + ) WHERE rn <= 3 + ORDER BY category, rn +""", "DataFrame") +print(df) +# Expected: top 3 products per category by sales +``` + +--- + +## 7. 
User-Defined Functions (UDF)

```python
from chdb.udf import chdb_udf
import chdb

@chdb_udf()
def fahrenheit_to_celsius(f):
    return (float(f) - 32) * 5.0 / 9.0

result = chdb.query("""
    SELECT
        city,
        temp_f,
        fahrenheit_to_celsius(temp_f) AS temp_c
    FROM file('weather.csv', CSVWithNames)
    ORDER BY temp_c DESC
    LIMIT 10
""", "DataFrame")
print(result)

@chdb_udf()
def classify_age(age):
    if int(age) < 18:
        return "minor"
    elif int(age) < 65:
        return "adult"
    else:
        return "senior"

chdb.query("""
    SELECT classify_age(age) AS group, count() AS cnt
    FROM file('users.parquet', Parquet)
    GROUP BY group
    ORDER BY cnt DESC
""", "Pretty").show()
```

---

## 8. Streaming Large Results

```python
from chdb import session as chs

sess = chs.Session()

# Stream results in chunks for memory efficiency
iterator = sess.send_query(
    "SELECT * FROM numbers(10000000)",
    format="CSV")

row_count = 0
for chunk in iterator:
    row_count += chunk.count(b'\n')

print(f"Total rows streamed: {row_count}")
# Expected: Total rows streamed: 10000000

sess.close()
```

---

## 9. 
Common Errors & Fixes + +### File not found + +```python +import chdb + +# Error: +chdb.query("SELECT * FROM file('missing.parquet', Parquet)") +# → DB::Exception: FILE_NOT_FOUND + +# Fix: verify the file path +import os +print(os.path.exists("missing.parquet")) # → False +# Use absolute path or check current working directory +chdb.query("SELECT * FROM file('/absolute/path/to/data.parquet', Parquet)") +``` + +### Wrong table function name + +```python +# Error: function name is case-sensitive for data lake functions +chdb.query("SELECT * FROM deltalake('s3://bucket/table', NOSIGN)") +# → DB::Exception: Unknown table function deltalake + +# Fix: use camelCase +chdb.query("SELECT * FROM deltaLake('s3://bucket/table', NOSIGN)") +``` + +### Database connection refused + +```python +# Error: missing port or wrong host format +chdb.query("SELECT * FROM mysql('db', 'shop', 'orders', 'root', 'pass')") +# → Connection refused + +# Fix: include port in host string +chdb.query("SELECT * FROM mysql('db:3306', 'shop', 'orders', 'root', 'pass')") +``` + +### Wrong output format + +```python +import chdb + +# Error: format name is case-sensitive +df = chdb.query("SELECT 1", "dataframe") +# → might not return expected type + +# Fix: use exact format name +df = chdb.query("SELECT 1", "DataFrame") # capital D, capital F +``` + +### Debugging queries + +```python +import chdb + +# Use Pretty format to quickly inspect results +chdb.query("SELECT * FROM file('data.parquet', Parquet) LIMIT 5", "Pretty").show() + +# Check column types +chdb.query(""" + SELECT name, toTypeName(name) AS name_type, toTypeName(value) AS value_type + FROM file('data.parquet', Parquet) + LIMIT 1 +""", "Pretty").show() + +# Explain query execution plan +chdb.query("EXPLAIN SELECT * FROM file('data.parquet', Parquet) WHERE x > 100").show() +``` diff --git a/skills/chdb-sql/metadata.json b/skills/chdb-sql/metadata.json new file mode 100644 index 0000000..bda785e --- /dev/null +++ b/skills/chdb-sql/metadata.json 
@@ -0,0 +1,10 @@ +{ + "version": "4.1.0", + "organization": "ClickHouse Inc", + "date": "March 2026", + "abstract": "In-process ClickHouse SQL engine for Python. Run SQL queries on local files, remote databases, and cloud storage without a server. Covers chdb.query(), Session, DB-API 2.0, parametrized queries, UDFs, streaming, and all ClickHouse table functions.", + "references": [ + "https://clickhouse.com/docs/chdb", + "https://github.com/chdb-io/chdb" + ] +} diff --git a/skills/chdb-sql/references/api-reference.md b/skills/chdb-sql/references/api-reference.md new file mode 100644 index 0000000..b11df79 --- /dev/null +++ b/skills/chdb-sql/references/api-reference.md @@ -0,0 +1,247 @@ +# chdb SQL API Reference + +> Complete signatures for the SQL-oriented chdb APIs. + +## Table of Contents + +- [chdb.query()](#chdbquery) +- [Session](#session) +- [Connection (DB-API 2.0)](#connection-db-api-20) +- [Output Formats](#output-formats) +- [Parametrized Queries](#parametrized-queries) +- [Streaming Queries](#streaming-queries) +- [Progress Callback](#progress-callback) +- [User-Defined Functions (UDF)](#user-defined-functions-udf) +- [AI-Assisted SQL](#ai-assisted-sql) + +--- + +## chdb.query() + +```python +chdb.query(sql, output_format="CSV", path="", udf_path="", params=None) +``` + +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `sql` | str | _(required)_ | ClickHouse SQL query | +| `output_format` | str | `"CSV"` | Output format (see [Output Formats](#output-formats)) | +| `path` | str | `""` | Database path (empty = in-memory, no state) | +| `udf_path` | str | `""` | Path for UDF scripts | +| `params` | dict | `None` | Named parameters (see [Parametrized Queries](#parametrized-queries)) | + +**Returns:** Result object with: + +| Property/Method | Description | +|-----------------|-------------| +| `.show()` | Print result to stdout | +| `.bytes()` | Raw bytes of the result | +| `.data()` | Result as string | +| `.rows_read` | 
Number of rows read | +| `.bytes_read` | Number of bytes read | +| `.elapsed` | Query execution time in seconds | + +```python +import chdb + +result = chdb.query("SELECT 1 + 1 AS answer") +result.show() # prints: 2 +print(result.data()) # "2\n" + +df = chdb.query("SELECT * FROM numbers(10)", "DataFrame") +print(df) # pandas DataFrame +``` + +--- + +## Session + +```python +from chdb import session as chs + +sess = chs.Session(path=":memory:") # in-memory (no persistence) +sess = chs.Session(path="./mydb") # persistent to disk +``` + +| Method | Signature | Description | +|--------|-----------|-------------| +| `query()` | `(sql, fmt="CSV", params=None)` | Execute SQL with session state | +| `send_query()` | `(sql, format="CSV")` | Streaming query (returns iterator) | +| `close()` | `()` | Close session and release resources | + +```python +from chdb import session as chs + +sess = chs.Session("./analytics") + +sess.query("CREATE TABLE t1 (id UInt64, name String) ENGINE = MergeTree() ORDER BY id") +sess.query("INSERT INTO t1 VALUES (1, 'Alice'), (2, 'Bob')") +result = sess.query("SELECT * FROM t1", "Pretty") +result.show() + +sess.close() +``` + +**Key differences from `chdb.query()`:** +- Session maintains state: tables, databases, and settings persist across calls +- Persistent sessions (`path="./dir"`) survive process restarts +- In-memory sessions (`path=":memory:"`) are discarded on close + +--- + +## Connection (DB-API 2.0) + +```python +from chdb import dbapi + +conn = dbapi.connect() # or: dbapi.connect(path="./mydb") +``` + +| Method | Description | +|--------|-------------| +| `conn.cursor()` | Create a cursor | +| `cur.execute(sql)` | Execute SQL | +| `cur.execute(sql, params)` | Execute with parameters | +| `cur.fetchone()` | Fetch one row | +| `cur.fetchmany(size)` | Fetch `size` rows | +| `cur.fetchall()` | Fetch all rows | +| `cur.description` | Column metadata | +| `cur.close()` | Close cursor | +| `conn.close()` | Close connection | + +```python 
+from chdb import dbapi + +conn = dbapi.connect() +cur = conn.cursor() +cur.execute("SELECT number, number * 2 AS doubled FROM numbers(5)") +print(cur.fetchall()) +# [(0, 0), (1, 2), (2, 4), (3, 6), (4, 8)] +cur.close() +conn.close() +``` + +--- + +## Output Formats + +| Format | Description | Use case | +|--------|-------------|----------| +| `"CSV"` | Comma-separated (default) | General export | +| `"CSVWithNames"` | CSV with header row | Spreadsheet import | +| `"JSON"` | JSON object with metadata | API responses | +| `"JSONEachRow"` | One JSON object per line | Streaming / NDJSON | +| `"DataFrame"` | pandas DataFrame | Python analysis | +| `"Arrow"` | Apache Arrow bytes | IPC format | +| `"ArrowTable"` | pyarrow.Table | Arrow ecosystem | +| `"Parquet"` | Parquet bytes | File export | +| `"Pretty"` | Formatted table | Terminal display | +| `"PrettyCompact"` | Compact table | Terminal display | +| `"TabSeparated"` | TSV | Tab-delimited export | +| `"Debug"` | Debug info | Troubleshooting | + +```python +import chdb + +chdb.query("SELECT 1", "Pretty").show() # formatted table +df = chdb.query("SELECT * FROM numbers(5)", "DataFrame") # pandas DataFrame +arrow = chdb.query("SELECT 1", "ArrowTable") # pyarrow Table +``` + +--- + +## Parametrized Queries + +Use `{name:Type}` placeholders in SQL, and pass values via `params`: + +```python +import chdb + +result = chdb.query( + """ + SELECT toDate({start:String}) + number AS date, rand() % 1000 AS value + FROM numbers({days:UInt64}) + """, + "DataFrame", + params={"start": "2025-01-01", "days": 30}) +print(result) +``` + +Supported types: `String`, `UInt8`–`UInt64`, `Int8`–`Int64`, `Float32`, `Float64`, `Date`, `DateTime`. 
+
+---
+
+## Streaming Queries
+
+For large results, use `send_query` on a Session to get an iterator:
+
+```python
+from chdb import session as chs
+
+sess = chs.Session()
+iterator = sess.send_query("SELECT * FROM numbers(1000000)", format="CSV")
+for chunk in iterator:
+    print(chunk[:100])  # process each chunk
+sess.close()
+```
+
+---
+
+## Progress Callback
+
+Monitor query progress:
+
+```python
+import chdb
+
+def on_progress(progress):
+    print(f"Rows: {progress.read_rows}, Bytes: {progress.read_bytes}")
+
+chdb.query("SELECT * FROM numbers(10000000)", "CSV", progress_callback=on_progress)
+```
+
+---
+
+## User-Defined Functions (UDF)
+
+Register Python functions as SQL UDFs using the `@chdb_udf` decorator:
+
+```python
+from chdb.udf import chdb_udf
+
+@chdb_udf()
+def my_multiply(x, y):
+    return int(x) * int(y)  # UDF arguments arrive as strings; convert before use
+
+import chdb
+result = chdb.query("SELECT my_multiply(number, 10) FROM numbers(5)", "DataFrame")
+print(result)
+```
+
+**Limitations:**
+- UDFs execute in-process, not distributed
+- Arguments are scalar and are passed to the function as strings; convert them (e.g. `int(x)`) before use
+- Performance may be lower than native ClickHouse functions for large datasets
+
+---
+
+## AI-Assisted SQL
+
+Generate SQL queries from natural language:
+
+```python
+import chdb
+
+sql = chdb.generate_sql("top 10 countries by revenue from orders.parquet")
+print(sql)
+# SELECT country, sum(revenue) AS total_revenue
+# FROM file('orders.parquet', Parquet)
+# GROUP BY country
+# ORDER BY total_revenue DESC
+# LIMIT 10
+
+result = chdb.ask("What are the top products by sales?", data="sales.parquet")
+print(result)
+```
+
+**Note:** These features require an LLM API key configured via environment variables.
diff --git a/skills/chdb-sql/references/sql-functions.md b/skills/chdb-sql/references/sql-functions.md
new file mode 100644
index 0000000..2e232b2
--- /dev/null
+++ b/skills/chdb-sql/references/sql-functions.md
@@ -0,0 +1,215 @@
+# ClickHouse SQL Functions Quick Reference
+
+> Commonly used SQL functions available in chdb.
+> For the full list, see [ClickHouse documentation](https://clickhouse.com/docs/en/sql-reference/functions). + +## Table of Contents + +- [Aggregate Functions](#aggregate-functions) +- [String Functions](#string-functions) +- [Date & Time Functions](#date--time-functions) +- [Type Conversion](#type-conversion) +- [Conditional Functions](#conditional-functions) +- [Array Functions](#array-functions) +- [JSON Functions](#json-functions) +- [Window Functions](#window-functions) + +--- + +## Aggregate Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `count()` | Row count | `SELECT count() FROM t` | +| `count(col)` | Non-null count | `SELECT count(email) FROM users` | +| `sum(col)` | Sum | `SELECT sum(amount) FROM orders` | +| `avg(col)` | Average | `SELECT avg(salary) FROM employees` | +| `min(col)`, `max(col)` | Min/Max | `SELECT min(price), max(price) FROM products` | +| `uniqExact(col)` | Exact distinct count | `SELECT uniqExact(user_id) FROM events` | +| `uniq(col)` | Approximate distinct count (faster) | `SELECT uniq(user_id) FROM events` | +| `groupArray(col)` | Collect values into array | `SELECT dept, groupArray(name) FROM emp GROUP BY dept` | +| `quantile(level)(col)` | Quantile | `SELECT quantile(0.95)(latency) FROM requests` | +| `quantiles(0.5, 0.9, 0.99)(col)` | Multiple quantiles | `SELECT quantiles(0.5, 0.9, 0.99)(duration)` | +| `median(col)` | Median (= quantile(0.5)) | `SELECT median(age) FROM users` | +| `stddevPop(col)` | Population std dev | `SELECT stddevPop(value) FROM measurements` | +| `varPop(col)` | Population variance | `SELECT varPop(value) FROM measurements` | +| `argMax(col, val)` | Value of col at max val | `SELECT argMax(name, score) FROM students` | +| `argMin(col, val)` | Value of col at min val | `SELECT argMin(name, score) FROM students` | +| `topK(N)(col)` | Most frequent N values | `SELECT topK(10)(search_term) FROM queries` | + +--- + +## String Functions + +| Function | Description | 
Example | +|----------|-------------|---------| +| `lower(s)` | Lowercase | `SELECT lower('Hello')` → `'hello'` | +| `upper(s)` | Uppercase | `SELECT upper('Hello')` → `'HELLO'` | +| `trim(s)` | Remove whitespace | `SELECT trim(' hi ')` → `'hi'` | +| `length(s)` | String length | `SELECT length('hello')` → `5` | +| `substring(s, offset, length)` | Extract substring | `SELECT substring('hello', 1, 3)` → `'hel'` | +| `concat(a, b, ...)` | Concatenate | `SELECT concat(first, ' ', last)` | +| `like(s, pattern)` | LIKE match | `WHERE like(email, '%@gmail.com')` | +| `match(s, pattern)` | Regex match | `WHERE match(url, '^https?://')` | +| `extract(s, pattern)` | Regex extract | `SELECT extract(url, '://([^/]+)')` | +| `replaceAll(s, from, to)` | Replace all occurrences | `SELECT replaceAll(text, '\n', ' ')` | +| `replaceOne(s, from, to)` | Replace first occurrence | `SELECT replaceOne(s, 'old', 'new')` | +| `splitByChar(sep, s)` | Split string to array | `SELECT splitByChar(',', 'a,b,c')` | +| `splitByString(sep, s)` | Split by substring | `SELECT splitByString('::', path)` | +| `format(template, ...)` | Format string | `SELECT format('{} - {}', name, dept)` | +| `reverse(s)` | Reverse string | `SELECT reverse('hello')` → `'olleh'` | +| `base64Encode(s)` | Base64 encode | `SELECT base64Encode('hello')` | +| `base64Decode(s)` | Base64 decode | `SELECT base64Decode(encoded)` | + +--- + +## Date & Time Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `today()` | Current date | `WHERE date = today()` | +| `now()` | Current datetime | `SELECT now()` | +| `toDate(x)` | Convert to Date | `SELECT toDate('2025-01-15')` | +| `toDateTime(x)` | Convert to DateTime | `SELECT toDateTime('2025-01-15 10:30:00')` | +| `toYear(d)` | Extract year | `SELECT toYear(order_date)` | +| `toMonth(d)` | Extract month | `SELECT toMonth(order_date)` | +| `toDayOfWeek(d)` | Day of week (1=Mon) | `SELECT toDayOfWeek(date)` | +| `toDayOfYear(d)` | Day of year 
| `SELECT toDayOfYear(date)` | +| `toHour(dt)` | Extract hour | `SELECT toHour(timestamp)` | +| `toMinute(dt)` | Extract minute | `SELECT toMinute(timestamp)` | +| `dateDiff(unit, d1, d2)` | Date difference | `SELECT dateDiff('day', start, end)` | +| `dateAdd(unit, n, d)` | Add to date | `SELECT dateAdd('month', 1, today())` | +| `dateSub(unit, n, d)` | Subtract from date | `SELECT dateSub('day', 7, today())` | +| `formatDateTime(dt, fmt)` | Format datetime | `SELECT formatDateTime(now(), '%Y-%m-%d %H:%M')` | +| `toStartOfMonth(d)` | First day of month | `SELECT toStartOfMonth(date)` | +| `toStartOfWeek(d)` | First day of week | `SELECT toStartOfWeek(date)` | +| `toStartOfHour(dt)` | Truncate to hour | `SELECT toStartOfHour(timestamp)` | +| `toMonday(d)` | Previous Monday | `SELECT toMonday(date)` | + +**Date units for dateDiff/dateAdd/dateSub:** `'second'`, `'minute'`, `'hour'`, `'day'`, `'week'`, `'month'`, `'quarter'`, `'year'`. + +--- + +## Type Conversion + +| Function | Description | Example | +|----------|-------------|---------| +| `toInt32(x)` | Convert to Int32 | `SELECT toInt32('42')` | +| `toUInt64(x)` | Convert to UInt64 | `SELECT toUInt64(id)` | +| `toFloat64(x)` | Convert to Float64 | `SELECT toFloat64('3.14')` | +| `toString(x)` | Convert to String | `SELECT toString(123)` | +| `CAST(x AS Type)` | SQL-style cast | `SELECT CAST(price AS Decimal(10,2))` | +| `toFixedString(s, n)` | Fixed-length string | `SELECT toFixedString(code, 3)` | +| `toDecimal64(x, s)` | Decimal with scale | `SELECT toDecimal64(price, 2)` | +| `parseDateTimeBestEffort(s)` | Smart datetime parse | `SELECT parseDateTimeBestEffort('Jan 15 2025')` | +| `toTypeName(x)` | Get type name | `SELECT toTypeName(column)` | + +--- + +## Conditional Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `if(cond, then, else)` | Ternary | `SELECT if(age >= 18, 'adult', 'minor')` | +| `multiIf(c1,v1, c2,v2, ..., default)` | Multi-branch | `SELECT 
multiIf(x>100,'high', x>50,'mid', 'low')` | +| `CASE WHEN ... THEN ... END` | SQL CASE | `CASE WHEN status=1 THEN 'active' ELSE 'inactive' END` | +| `coalesce(a, b, ...)` | First non-null | `SELECT coalesce(nickname, name, 'Unknown')` | +| `nullIf(a, b)` | NULL if a=b | `SELECT nullIf(value, 0)` | +| `ifNull(x, alt)` | Replace NULL | `SELECT ifNull(email, 'no-email')` | +| `isNull(x)` | Check NULL | `WHERE isNull(deleted_at)` | +| `isNotNull(x)` | Check not NULL | `WHERE isNotNull(email)` | + +--- + +## Array Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `arrayJoin(arr)` | Expand array to rows | `SELECT arrayJoin([1, 2, 3])` | +| `length(arr)` | Array length | `SELECT length(tags)` | +| `arrayMap(f, arr)` | Transform elements | `SELECT arrayMap(x -> x * 2, [1, 2, 3])` | +| `arrayFilter(f, arr)` | Filter elements | `SELECT arrayFilter(x -> x > 1, [1, 2, 3])` | +| `arrayExists(f, arr)` | Any element matches | `WHERE arrayExists(x -> x = 'admin', roles)` | +| `arrayAll(f, arr)` | All elements match | `WHERE arrayAll(x -> x > 0, scores)` | +| `arraySort(arr)` | Sort array | `SELECT arraySort([3, 1, 2])` → `[1, 2, 3]` | +| `arrayDistinct(arr)` | Unique elements | `SELECT arrayDistinct(tags)` | +| `arrayConcat(a, b)` | Merge arrays | `SELECT arrayConcat([1, 2], [3, 4])` | +| `has(arr, elem)` | Contains element | `WHERE has(tags, 'important')` | +| `indexOf(arr, elem)` | Find element index | `SELECT indexOf(arr, 'target')` | +| `arraySlice(arr, offset, length)` | Sub-array | `SELECT arraySlice(arr, 1, 3)` | + +--- + +## JSON Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `JSONExtract(json, key, Type)` | Extract typed value | `SELECT JSONExtract(data, 'age', 'Int32')` | +| `JSONExtractString(json, key)` | Extract as string | `SELECT JSONExtractString(data, 'name')` | +| `JSONExtractInt(json, key)` | Extract as integer | `SELECT JSONExtractInt(data, 'count')` | +| 
`JSONExtractFloat(json, key)` | Extract as float | `SELECT JSONExtractFloat(data, 'price')` | +| `JSONExtractBool(json, key)` | Extract as boolean | `SELECT JSONExtractBool(data, 'active')` | +| `JSONExtractArrayRaw(json, key)` | Extract array as strings | `SELECT JSONExtractArrayRaw(data, 'tags')` | +| `simpleJSONExtractString(json, key)` | Fast string extract (flat JSON) | `SELECT simpleJSONExtractString(log, 'level')` | +| `JSONHas(json, key)` | Key exists | `WHERE JSONHas(data, 'email')` | +| `JSONLength(json, key)` | Array/object length | `SELECT JSONLength(data, 'items')` | +| `JSONType(json, key)` | Value type | `SELECT JSONType(data, 'value')` | + +**Nested access:** Use path syntax: `JSONExtractString(data, 'user', 'address', 'city')` + +--- + +## Window Functions + +Window functions compute values across a set of rows related to the current row. + +### Syntax + +```sql +function() OVER ( + [PARTITION BY col1, col2, ...] + [ORDER BY col1 [ASC|DESC], ...] + [ROWS|RANGE BETWEEN ... AND ...] 
+) +``` + +### Ranking Functions + +| Function | Description | +|----------|-------------| +| `row_number()` | Sequential number (no ties) | +| `rank()` | Rank with gaps for ties | +| `dense_rank()` | Rank without gaps | +| `ntile(n)` | Distribute into n buckets | + +```sql +SELECT name, dept, salary, + row_number() OVER (PARTITION BY dept ORDER BY salary DESC) AS rn, + rank() OVER (ORDER BY salary DESC) AS overall_rank +FROM employees +``` + +### Value Functions + +| Function | Description | +|----------|-------------| +| `lag(col, offset, default)` | Previous row value | +| `lead(col, offset, default)` | Next row value | +| `first_value(col)` | First value in window | +| `last_value(col)` | Last value in window | + +```sql +SELECT date, revenue, + lag(revenue, 1, 0) OVER (ORDER BY date) AS prev_revenue, + revenue - lag(revenue, 1, 0) OVER (ORDER BY date) AS daily_change +FROM daily_sales +``` + +### Aggregate as Window + +```sql +SELECT date, revenue, + sum(revenue) OVER (ORDER BY date) AS cumulative, + avg(revenue) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS rolling_7d +FROM daily_sales +``` diff --git a/skills/chdb-sql/references/table-functions.md b/skills/chdb-sql/references/table-functions.md new file mode 100644 index 0000000..552c40d --- /dev/null +++ b/skills/chdb-sql/references/table-functions.md @@ -0,0 +1,214 @@ +# ClickHouse Table Functions for chdb + +> Table functions let you query external data sources directly in SQL. +> Use them with `chdb.query()` or inside a `Session`. + +## Table of Contents + +- [File Sources](#file-sources) +- [Cloud Storage](#cloud-storage) +- [Databases](#databases) +- [Data Lakes](#data-lakes) +- [Utility Functions](#utility-functions) + +--- + +## File Sources + +### file() + +Query local files. Format is auto-detected from extension or specified explicitly. 
+
+```sql
+SELECT * FROM file('data.parquet', Parquet)
+SELECT * FROM file('data.csv', CSVWithNames)
+SELECT * FROM file('events.jsonl', JSONEachRow)
+SELECT * FROM file('logs/*.parquet', Parquet) -- glob pattern
+SELECT * FROM file('data/2024-*/events.csv', CSVWithNames) -- nested glob
+```
+
+**Parameters:** `file(path [, format [, structure [, compression]]])`
+
+Supported formats: `Parquet`, `CSVWithNames`, `CSV`, `TSVWithNames`, `JSONEachRow`, `JSON`, `Arrow`, `ORC`, `Avro`, `TSV`.
+
+Supported compression: auto-detected from extension (`.gz`, `.zst`, `.bz2`, `.xz`, `.lz4`).
+
+---
+
+## Cloud Storage
+
+### s3()
+
+```sql
+-- Public (no auth)
+SELECT * FROM s3('s3://bucket/path.parquet', NOSIGN)
+
+-- With credentials
+SELECT * FROM s3('s3://bucket/path.parquet', 'ACCESS_KEY', 'SECRET_KEY', 'Parquet')
+
+-- Glob pattern
+SELECT * FROM s3('s3://bucket/logs/2024-*.parquet', 'KEY', 'SECRET', 'Parquet')
+```
+
+**Parameters:** `s3(url [, NOSIGN | access_key, secret_key] [, format [, structure [, compression]]])`
+
+### gcs()
+
+```sql
+SELECT * FROM gcs('gs://bucket/data.parquet', NOSIGN)
+SELECT * FROM gcs('gs://bucket/data.parquet', 'HMAC_KEY', 'HMAC_SECRET', 'Parquet')
+```
+
+**Parameters:** Same as `s3()`.
+
+### azureBlobStorage()
+
+```sql
+SELECT * FROM azureBlobStorage(
+    'DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...',
+    'container', 'path/data.parquet', 'Parquet')
+```
+
+**Parameters:** `azureBlobStorage(connection_string, container, path [, format [, structure [, compression]]])`
+
+### hdfs()
+
+```sql
+SELECT * FROM hdfs('hdfs://namenode:9000/warehouse/data.parquet', 'Parquet')
+SELECT * FROM hdfs('hdfs://namenode:9000/logs/*.parquet', 'Parquet')
+```
+
+**Parameters:** `hdfs(uri [, format [, structure [, compression]]])`
+
+---
+
+## Databases
+
+### mysql()
+
+```sql
+SELECT * FROM mysql('host:3306', 'database', 'table', 'user', 'password')
+
+-- With WHERE pushdown
+SELECT * FROM mysql('db:3306', 'shop', 'orders', 'root', 'pass')
+WHERE status = 'shipped' AND amount > 100
+```
+
+**Parameters:** `mysql(host:port, database, table, user, password)`
+
+**Note:** Port is part of the host string (e.g., `'db:3306'`), not a separate parameter.
+
+### postgresql()
+
+```sql
+SELECT * FROM postgresql('host:5432', 'database', 'table', 'user', 'password')
+
+SELECT * FROM postgresql('pg:5432', 'analytics', 'events', 'analyst', 'pass')
+ORDER BY created_at DESC LIMIT 100
+```
+
+**Parameters:** `postgresql(host:port, database, table, user, password)`
+
+### remote() / remoteSecure()
+
+Query a remote ClickHouse server:
+
+```sql
+SELECT * FROM remote('host:9000', 'database', 'table', 'user', 'password')
+SELECT * FROM remoteSecure('host:9440', 'database', 'table', 'user', 'password')
+```
+
+**Parameters:** `remote(host:port, database, table [, user [, password]])`
+
+### mongodb()
+
+```sql
+SELECT * FROM mongodb('host:27017', 'database', 'collection', 'user', 'password', 'id UInt64, name String')
+```
+
+**Parameters:** `mongodb(host:port, database, collection, user, password, structure)`
+
+**Note:** The column `structure` string is required for `mongodb()`.
+
+### sqlite()
+
+```sql
+SELECT * FROM sqlite('/path/to/database.db', 'table_name')
+```
+
+**Parameters:** `sqlite(database_path, table)`
+
+---
+
+## Data Lakes
+
+### iceberg()
+
+```sql
+SELECT * FROM iceberg('s3://bucket/iceberg/table', 'ACCESS_KEY', 'SECRET_KEY')
+SELECT * FROM iceberg('s3://bucket/iceberg/table', NOSIGN)
+```
+
+**Parameters:** `iceberg(url [, NOSIGN | access_key, secret_key] [, format])`
+
+### deltaLake()
+
+```sql
+SELECT * FROM deltaLake('s3://bucket/delta/table', 'ACCESS_KEY', 'SECRET_KEY')
+SELECT * FROM deltaLake('s3://bucket/delta/table', NOSIGN)
+```
+
+**Parameters:** `deltaLake(url [, NOSIGN | access_key, secret_key])`
+
+**Note:** Function name is `deltaLake` (camelCase), not `deltalake`.
+
+### hudi()
+
+```sql
+SELECT * FROM hudi('s3://bucket/hudi/table', 'ACCESS_KEY', 'SECRET_KEY')
+SELECT * FROM hudi('s3://bucket/hudi/table', NOSIGN)
+```
+
+**Parameters:** `hudi(url [, NOSIGN | access_key, secret_key])`
+
+---
+
+## Utility Functions
+
+### numbers()
+
+Generate a sequence of numbers (useful for testing and date generation):
+
+```sql
+SELECT * FROM numbers(100) -- 0 to 99
+SELECT * FROM numbers(10, 100) -- 10 to 109
+SELECT toDate('2025-01-01') + number AS date FROM numbers(365) -- date range
+```
+
+**Parameters:** `numbers([offset,] count)`
+
+### Python()
+
+Use a Python dict or DataFrame as a SQL table:
+
+```python
+import chdb
+
+data = {"name": ["Alice", "Bob"], "score": [95, 87]}
+chdb.query("SELECT * FROM Python(data) ORDER BY score DESC")
+
+import pandas as pd
+df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
+chdb.query("SELECT * FROM Python(df) WHERE value > 15")
+```
+
+**Note:** The Python variable must be in scope when the query executes.
+ +### url() + +Query data from an HTTP/HTTPS URL: + +```sql +SELECT * FROM url('https://example.com/data.csv', CSVWithNames) +SELECT * FROM url('https://api.example.com/data.json', JSONEachRow) +``` + +**Parameters:** `url(url, format [, structure])` diff --git a/skills/chdb-sql/scripts/verify_install.py b/skills/chdb-sql/scripts/verify_install.py new file mode 100644 index 0000000..fe99398 --- /dev/null +++ b/skills/chdb-sql/scripts/verify_install.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Verify chdb SQL installation and basic functionality.""" + +import sys + +PASS = "OK" +FAIL = "FAIL" +results = [] + + +def check(name, fn): + try: + fn() + results.append((name, PASS, "")) + print(f" [{PASS}] {name}") + except Exception as e: + results.append((name, FAIL, str(e))) + print(f" [{FAIL}] {name}: {e}") + + +def check_python_version(): + assert sys.version_info >= (3, 9), f"Python 3.9+ required, got {sys.version}" + + +def check_chdb_import(): + import chdb + assert hasattr(chdb, "__version__"), "chdb imported but missing __version__" + print(f" chdb version: {chdb.__version__}") + + +def check_basic_query(): + import chdb + result = chdb.query("SELECT 1 + 1 AS answer") + data = result.data() + assert "2" in data, f"Expected '2' in output, got: {data!r}" + + +def check_dataframe_output(): + import chdb + df = chdb.query("SELECT number FROM numbers(5)", "DataFrame") + assert len(df) == 5, f"Expected 5 rows, got {len(df)}" + assert "number" in df.columns, f"Expected 'number' column, got {list(df.columns)}" + + +def check_session(): + from chdb import session as chs + sess = chs.Session() + sess.query("CREATE TABLE _verify_test (id UInt64) ENGINE = Memory") + sess.query("INSERT INTO _verify_test VALUES (1), (2), (3)") + result = sess.query("SELECT count() AS cnt FROM _verify_test") + data = result.data() + assert "3" in data, f"Expected '3' in output, got: {data!r}" + sess.close() + + +def check_parametrized(): + import chdb + result = chdb.query( + "SELECT 
{x:UInt64} + {y:UInt64} AS sum", + params={"x": 10, "y": 20}) + data = result.data() + assert "30" in data, f"Expected '30' in output, got: {data!r}" + + +if __name__ == "__main__": + print("chdb SQL Installation Verification") + print("=" * 40) + + check("Python version >= 3.9", check_python_version) + check("import chdb", check_chdb_import) + check("Basic query (SELECT 1+1)", check_basic_query) + check("DataFrame output format", check_dataframe_output) + check("Session create + query", check_session) + check("Parametrized query", check_parametrized) + + print() + print("=" * 40) + passed = sum(1 for _, s, _ in results if s == PASS) + total = len(results) + print(f"Results: {passed}/{total} passed") + + if passed < total: + print("\nFailed checks:") + for name, status, err in results: + if status == FAIL: + print(f" - {name}: {err}") + sys.exit(1) + else: + print("All checks passed!") From 5ec47aef563b947c269638f33506b4bcd0c129c3 Mon Sep 17 00:00:00 2001 From: auxten Date: Tue, 17 Mar 2026 21:24:32 +0800 Subject: [PATCH 2/2] Address review feedback: fix .where() docs and strengthen sort test - Fix .where() description: it follows pandas semantics (masks non-matching values with NaN) rather than being an alias for .filter() - Add .where() usage example showing correct behavior - Add sorted values assertion to verify_install.py check_sort() --- skills/chdb-datastore/references/api-reference.md | 3 ++- skills/chdb-datastore/scripts/verify_install.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/skills/chdb-datastore/references/api-reference.md b/skills/chdb-datastore/references/api-reference.md index 5224667..a3708d0 100644 --- a/skills/chdb-datastore/references/api-reference.md +++ b/skills/chdb-datastore/references/api-reference.md @@ -55,7 +55,7 @@ See [connectors.md](connectors.md) for all factory methods (`from_file`, `from_m | `ds[condition]` | DataStore | Boolean filter (e.g., `ds[ds['age'] > 25]`) | | `.select(*fields)` | DataStore | 
SQL-style SELECT with expressions | | `.filter(condition)` | DataStore | SQL-style WHERE clause | -| `.where(condition)` | DataStore | Alias for `.filter()` | +| `.where(condition)` | DataStore | Mask values where condition is False (pandas semantics) | ```python result = ds[ds["age"] > 25] @@ -63,6 +63,7 @@ result = ds[(ds["status"] == "active") & (ds["revenue"] > 1000)] result = ds[["name", "city", "revenue"]] result = ds.select("name", "revenue * 1.1 AS adjusted_revenue") result = ds.filter(ds["country"] == "US") +result = ds.where(ds["age"] > 25) # keeps all rows; non-matching values become NaN ``` --- diff --git a/skills/chdb-datastore/scripts/verify_install.py b/skills/chdb-datastore/scripts/verify_install.py index 260dd55..b32a782 100644 --- a/skills/chdb-datastore/scripts/verify_install.py +++ b/skills/chdb-datastore/scripts/verify_install.py @@ -57,7 +57,8 @@ def check_sort(): ds = DataStore({"name": ["Charlie", "Alice", "Bob"], "value": [3, 1, 2]}) sorted_ds = ds.sort_values("value") cols = sorted_ds.columns - assert "name" in cols and "value" in cols, f"Unexpected columns: {cols}" + assert "name" in cols and "value" in cols, f"Missing expected columns: {cols}" + assert list(sorted_ds["value"]) == [1, 2, 3], f"Expected sorted values [1, 2, 3], got {list(sorted_ds['value'])}" def check_groupby():