From ff1fe63e13076529b0a26bb8408bee192f7c08f0 Mon Sep 17 00:00:00 2001 From: zuolingxuan Date: Mon, 30 Mar 2026 16:55:27 +0800 Subject: [PATCH 1/3] Reorganize core contracts and Python ecosystem --- AGENTS.md | 11 +- BUILD.bazel | 141 +++++- README-zh.md | 460 +++++++++-------- README.md | 447 ++++++++-------- docs/core-boundary.md | 154 ++++++ docs/local_vector_search_v01.md | 29 ++ docs/runtime-contract.md | 254 ++++++++++ docs/streaming_runtime_design.md | 4 +- python_api/BUILD.bazel | 87 +++- python_api/README.md | 315 ++++++++---- .../{ => benchmarks}/bench_arrow_ingestion.py | 0 python_api/examples/README.md | 14 + .../{ => examples}/demo_batch_sql_arrow.py | 0 .../demo_bitable_group_by_owner.py | 0 python_api/{ => examples}/demo_stream_sql.py | 0 python_api/examples/demo_vector_search.py | 59 +++ python_api/experimental/README.md | 11 + python_api/tests/test_python_cli_contract.py | 84 +++ python_api/tests/test_streaming_v05.py | 5 + python_api/tests/test_vector_search.py | 53 ++ python_api/velaria/custom_stream.py | 6 +- python_api/velaria_cli.py | 76 +-- scripts/BUILD.bazel | 6 +- scripts/build_dashboard_frontend.sh | 46 -- scripts/run_actor_rpc_e2e.sh | 26 +- scripts/run_actor_rpc_scheduler.sh | 8 +- scripts/run_core_regression.sh | 12 + scripts/run_experimental_regression.sh | 11 + scripts/run_python_ci_checks.sh | 25 +- scripts/run_python_ecosystem_regression.sh | 52 ++ scripts/run_vector_search_benchmark.sh | 81 +++ src/dataflow/core/csv.cc | 6 + src/dataflow/core/value.h | 14 +- src/dataflow/examples/actor_rpc_scheduler.cc | 9 - .../examples/vector_search_benchmark.cc | 30 +- src/dataflow/examples/velaria_cli.cc | 139 ++--- src/dataflow/python/python_module.cc | 10 + src/dataflow/runner/actor_runtime.cc | 479 ------------------ src/dataflow/runner/actor_runtime.h | 2 - src/dataflow/runner/dashboard/app.ts | 445 ---------------- src/dataflow/runner/dashboard/index.html | 189 ------- src/dataflow/tests/stream_runtime_test.cc | 75 +++ 
src/dataflow/tests/vector_runtime_test.cc | 46 ++ 43 files changed, 1964 insertions(+), 1957 deletions(-) create mode 100644 docs/core-boundary.md create mode 100644 docs/runtime-contract.md rename python_api/{ => benchmarks}/bench_arrow_ingestion.py (100%) create mode 100644 python_api/examples/README.md rename python_api/{ => examples}/demo_batch_sql_arrow.py (100%) rename python_api/{ => examples}/demo_bitable_group_by_owner.py (100%) rename python_api/{ => examples}/demo_stream_sql.py (100%) create mode 100644 python_api/examples/demo_vector_search.py create mode 100644 python_api/experimental/README.md create mode 100644 python_api/tests/test_python_cli_contract.py delete mode 100755 scripts/build_dashboard_frontend.sh create mode 100755 scripts/run_core_regression.sh create mode 100755 scripts/run_experimental_regression.sh create mode 100755 scripts/run_python_ecosystem_regression.sh create mode 100755 scripts/run_vector_search_benchmark.sh delete mode 100644 src/dataflow/runner/dashboard/app.ts delete mode 100644 src/dataflow/runner/dashboard/index.html diff --git a/AGENTS.md b/AGENTS.md index 64bdac1..d6af52a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -47,7 +47,7 @@ ### 调度执行模型 - scheduler 负责接入、快照记录、分发与状态收集,不在本地执行 SQL。 -- dashboard/客户端提交均应走 worker 执行链路。 +- 客户端提交均应走 worker 执行链路。 ### 当前调度策略(v1) @@ -78,10 +78,9 @@ bazel run //:sql_demo bazel run //:df_demo bazel run //:stream_demo bazel run //:actor_rpc_smoke -./scripts/run_actor_rpc_scheduler.sh -- --listen 127.0.0.1:61000 --node-id scheduler --dashboard-enabled --dashboard-listen 127.0.0.1:8080 +./scripts/run_actor_rpc_scheduler.sh -- --listen 127.0.0.1:61000 --node-id scheduler bazel run //:actor_rpc_worker -- --connect 127.0.0.1:61000 --node-id worker-1 bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo payload" -bazel build //:dashboard_app_js ``` ### 一次 build/smoke 摘要 @@ -133,12 +132,6 @@ bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo paylo 
./scripts/run_actor_rpc_e2e.sh --payload "demo payload" ``` -可选:通过 dashboard 启动同一运行链路: - -```bash -./scripts/run_actor_rpc_scheduler.sh -- --listen 127.0.0.1:61000 --node-id scheduler --dashboard-enabled --dashboard-listen 127.0.0.1:8080 -``` - 通过标准: - scheduler 输出 `[scheduler] listen 127.0.0.1:61000` - worker 输出 `[worker] connected 127.0.0.1:61000` diff --git a/BUILD.bazel b/BUILD.bazel index 6d5b51c..9b9418c 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,5 +1,105 @@ load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") +filegroup( + name = "velaria_core_logical_sources", + srcs = [ + "src/dataflow/planner/plan.cc", + "src/dataflow/planner/plan.h", + "src/dataflow/sql/sql_ast.h", + "src/dataflow/sql/sql_errors.h", + "src/dataflow/sql/sql_parser.cc", + "src/dataflow/sql/sql_parser.h", + "src/dataflow/sql/sql_planner.cc", + "src/dataflow/sql/sql_planner.h", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_core_execution_sources", + srcs = [ + "src/dataflow/core/csv.cc", + "src/dataflow/core/csv.h", + "src/dataflow/core/table.cc", + "src/dataflow/core/table.h", + "src/dataflow/core/value.cc", + "src/dataflow/core/value.h", + "src/dataflow/runtime/executor.cc", + "src/dataflow/runtime/executor.h", + "src/dataflow/runtime/job_master.cc", + "src/dataflow/runtime/job_master.h", + "src/dataflow/runtime/observability.h", + "src/dataflow/runtime/rpc_contract.h", + "src/dataflow/runtime/vector_index.cc", + "src/dataflow/runtime/vector_index.h", + "src/dataflow/serial/serializer.cc", + "src/dataflow/serial/serializer.h", + "src/dataflow/stream/binary_row_batch.cc", + "src/dataflow/stream/binary_row_batch.h", + "src/dataflow/stream/stream.cc", + "src/dataflow/stream/stream.h", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_core_contract_sources", + srcs = [ + "src/dataflow/api/dataframe.cc", + "src/dataflow/api/dataframe.h", + "src/dataflow/api/session.cc", + "src/dataflow/api/session.h", + 
"src/dataflow/catalog/catalog.cc", + "src/dataflow/catalog/catalog.h", + "src/dataflow/stream/source_sink_abi.h", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_python_ecosystem_sources", + srcs = [ + "//python_api:velaria_python_supported_sources", + "//python_api:velaria_python_example_sources", + "//python_api:velaria_python_experimental_sources", + "src/dataflow/python/python_module.cc", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_experimental_sources", + srcs = [ + "src/dataflow/ai/plugin_runtime.cc", + "src/dataflow/ai/plugin_runtime.h", + "src/dataflow/rpc/actor_rpc_codec.cc", + "src/dataflow/rpc/actor_rpc_codec.h", + "src/dataflow/rpc/rpc_codec.cc", + "src/dataflow/rpc/rpc_codec.h", + "src/dataflow/rpc/serialization.cc", + "src/dataflow/rpc/serialization.h", + "src/dataflow/runner/actor_runtime.cc", + "src/dataflow/runner/actor_runtime.h", + "src/dataflow/runtime/actor_runtime.cc", + "src/dataflow/runtime/actor_runtime.h", + "src/dataflow/runtime/byte_transport.cc", + "src/dataflow/runtime/byte_transport.h", + "src/dataflow/runtime/rpc_runner.cc", + "src/dataflow/runtime/rpc_runner.h", + "src/dataflow/stream/actor_stream_runtime.cc", + "src/dataflow/stream/actor_stream_runtime.h", + "src/dataflow/transport/ipc_transport.cc", + "src/dataflow/transport/ipc_transport.h", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_examples_sources", + srcs = glob(["src/dataflow/examples/*.cc"]), + visibility = ["//visibility:public"], +) + cc_library( name = "dataflow_core", srcs = [ @@ -56,6 +156,32 @@ cc_library( visibility = ["//visibility:public"], ) +test_suite( + name = "core_regression", + tests = [ + ":planner_v03_test", + ":source_sink_abi_test", + ":sql_regression_test", + ":stream_runtime_test", + ":stream_strategy_explain_test", + ":vector_runtime_test", + ], +) + +test_suite( + name = "python_ecosystem_regression", + tests = [ + 
"//python_api:velaria_python_supported_regression", + ], +) + +test_suite( + name = "experimental_regression", + tests = [ + ":stream_actor_credit_test", + ], +) + cc_binary( name = "wordcount_demo", srcs = ["src/dataflow/examples/wordcount.cc"], @@ -195,24 +321,9 @@ cc_library( cc_binary( name = "actor_rpc_scheduler", srcs = ["src/dataflow/examples/actor_rpc_scheduler.cc"], - data = [ - ":dashboard_app_js", - "src/dataflow/runner/dashboard/index.html", - ], deps = [":dataflow_actor_runner"], ) -genrule( - name = "dashboard_app_js", - srcs = [ - "src/dataflow/runner/dashboard/app.ts", - "//scripts:build_dashboard_frontend.sh", - ], - outs = ["src/dataflow/runner/dashboard/app.js"], - cmd = "$(location //scripts:build_dashboard_frontend.sh) $(location src/dataflow/runner/dashboard/app.ts) $@", - tools = ["//scripts:build_dashboard_frontend.sh"], -) - cc_binary( name = "actor_rpc_worker", srcs = ["src/dataflow/examples/actor_rpc_worker.cc"], diff --git a/README-zh.md b/README-zh.md index 08152d5..31d515c 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,185 +1,194 @@ -# Velaria:纯 C++17 Streaming-First 数据流内核 +# Velaria:纯 C++17 本地数据流内核 `README-zh.md` 是中文镜像文档,对应英文主文档位于 [README.md](./README.md)。后续修改必须保持这两份文件结构和语义同步。 -Velaria 是一个本地优先的 C++17 数据流引擎研究项目。当前目标刻意收敛:先把单机流式主链路稳定下来,让 batch 和 stream 落在同一个执行模型里,再谨慎扩展到本机多进程执行,而不是宣称已经完成分布式运行时。 +Velaria 是一个本地优先的 C++17 数据流引擎研究项目。仓库现在围绕“一个内核 + 两个非内核层”组织: -## 它当前是什么 +- `Core Kernel` + - 本地执行语义 + - batch + stream 共用一个模型 + - 稳定的 explain / progress / checkpoint contract +- `Python Ecosystem` + - 正式支持 Arrow / wheel / CLI / `uv` / Excel / Bitable / custom stream adapter + - 向外投影内核能力,但不进入热路径 +- `Experimental Runtime` + - 同机 `actor/rpc/jobmaster` + - 用于执行与观测研究,不是第二套内核 -当前主执行路线是: +## 黄金路径 -`micro-batch in-proc + query-local backpressure + local worker scale-up` - -仓库里同时保留 `actor + rpc + jobmaster` 作为同机多进程实验路径。这个路径用于执行和观测研究,不代表已经具备完整分布式调度、故障恢复、状态迁移或资源治理能力。 - -核心公开对象: - -- `DataflowSession` -- `DataFrame` -- `StreamingDataFrame` -- 
`StreamingQuery` - -核心本地执行链: +唯一黄金路径是: ```text -source -> StreamingDataFrame/operator chain -> sink +Arrow / CSV / Python ingress + -> DataflowSession / DataFrame / StreamingDataFrame + -> local runtime kernel + -> sink + -> explain / progress / checkpoint ``` -同机实验链: - -```text -client -> scheduler(jobmaster) -> worker -> in-proc operator chain -> result -``` - -## 当前能力边界 - -当前已具备: - -- batch + streaming 共用一个本地执行框架 -- `read_csv`, `readStream(...)`, `readStreamCsvDir(...)` -- query-local 反压、有界 backlog、progress snapshot、checkpoint path -- 执行模式:`single-process`、`local-workers`、`actor-credit`、`auto` -- 本地文件 source/sink -- 基础流式算子:`select / filter / withColumn / drop / limit / window` -- stateful `sum` 和 `count` -- 建立在现有 streaming operators 上的最小 stream SQL -- Python Arrow 输入/输出 -- 同机 actor/rpc/jobmaster smoke 链路 +公开 session 入口: -当前明确不做: +- `DataflowSession` -- 宣称完成 distributed runtime -- 把 Python callback 拉进热路径 -- Python UDF -- Python sink callback 直接进入 native sink ABI -- 把 `actor-credit` 扩成通用计划并行化 -- 横向扩很多 SQL 面,例如完整 `JOIN / CTE / subquery / UNION / streaming AVG/MIN/MAX` +核心对外对象: -## Streaming 运行时 Contract +- `DataFrame` +- `StreamingDataFrame` +- `StreamingQuery` -主要 streaming 入口: +同机 actor/rpc 路径仍保留在仓库里,但不再作为主叙事。 -- `session.readStream(source)` -- `session.readStreamCsvDir(path)` -- `session.streamSql(sql)` -- `session.explainStreamSql(sql, options)` -- `session.startStreamSql(sql, options)` -- `StreamingDataFrame.writeStream(sink, options)` +## 仓库分层 -### Progress 与 Strategy +### Core Kernel -`StreamingQueryProgress` 和 `snapshotJson()` 会暴露: +Core 负责: -- 执行选择:`execution_mode`、`execution_reason`、`transport_mode` -- 工作量估算:`estimated_state_size_bytes`、`estimated_batch_cost` -- source/sink 状态:`source_is_bounded`、`sink_is_blocking` -- 流控计数:backlog、inflight、blocked、水位字段 -- checkpoint 字段:`checkpoint_delivery_mode`、`last_source_offset` +- logical planning 与最小 SQL 映射 +- table/value 执行模型 +- 本地 batch 与 streaming runtime +- source/sink ABI +- runtime contract surface +- 本地 
vector search 能力 -`explainStreamSql(...)` 返回三段: +仓库入口: -- `logical` -- `physical` -- `strategy` +- 文档: + - [docs/core-boundary.md](./docs/core-boundary.md) + - [docs/runtime-contract.md](./docs/runtime-contract.md) + - [docs/streaming_runtime_design.md](./docs/streaming_runtime_design.md) +- Bazel source group: + - `//:velaria_core_logical_sources` + - `//:velaria_core_execution_sources` + - `//:velaria_core_contract_sources` +- 回归套件: + - `//:core_regression` -其中 `strategy` 会解释 selected mode、fallback reason、actor 热路径命中与否、transport、batch/state 估算,以及 actor/shared-memory 决策参数。 +### Python Ecosystem -### 反压 +Python 是正式支持的生态层,不是顺手附带的 wrapper。 -当前反压语义是 query-local 且有界: +它包括: -- `backlog` 表示 pull 之后、drain 之前的队列 batch 数 -- `blocked_count` 统计 producer 进入 wait 的事件次数,不统计循环轮数 -- `max_backlog_batches` 表示 enqueue 后观测到的最大 backlog -- `inflight_batches` 和 `inflight_partitions` 表示尚未消费的排队工作量 -- sink 变慢、state finalize 变慢、或局部分区压力过大,都会反馈到同一个 query-local backlog 计数 +- `python_api` 里的 native binding +- Arrow 输入与输出 +- `uv` 工作流 +- wheel / native wheel / CLI 打包 +- Excel 与 Bitable 适配 +- custom source / custom sink adapter +- `python_api/velaria_cli.py` 里的正式 CLI 工具入口 +- `python_api/examples` 里的 Python 生态 demo +- `python_api/benchmarks` 里的 Python benchmark -延迟字段定义: +它不定义: -- `last_batch_latency_ms`:从 batch 开始执行到 sink flush 完成 -- `last_sink_latency_ms`:sink write + flush -- `last_state_latency_ms`:state/window finalize,stateless batch 为 `0` +- 执行热路径行为 +- 独立的 progress/checkpoint 语义 +- 独立的 vector-search 语义 -### Checkpoint 与 Resume +仓库入口: -checkpoint 文件是本地文件,并采用原子替换写入。 +- 文档: + - [python_api/README.md](./python_api/README.md) +- Bazel source group: + - `//:velaria_python_ecosystem_sources` +- Python 层 source group: + - `//python_api:velaria_python_supported_sources` + - `//python_api:velaria_python_example_sources` + - `//python_api:velaria_python_experimental_sources` +- 回归套件: + - `//:python_ecosystem_regression` +- Python 层回归套件: + - `//python_api:velaria_python_supported_regression` +- shell 入口: 
+ - `./scripts/run_python_ecosystem_regression.sh` -当前交付语义: +### Experimental Runtime -- 默认 `at-least-once`:不恢复 source offset;允许 replay,sink 允许重复输出 -- `best-effort`:仅当 source 实现了 `restoreOffsetToken(...)` 时恢复 offset;仍然不是 exactly-once sink 交付 +Experimental runtime 包括: -当前内置 source 行为: +- actor runtime +- rpc codec / transport 实验 +- scheduler / worker / client 链路 +- 同机 smoke 与 benchmark 工具 -- `MemoryStreamSource`:在 `best-effort` 下可按 batch offset 恢复 -- `DirectoryCsvStreamSource`:在 `best-effort` 下可按最后完成的文件恢复 +仓库入口: -### Actor-Credit 与 Auto +- Bazel source group: + - `//:velaria_experimental_sources` +- 回归套件: + - `//:experimental_regression` +- shell 入口: + - `./scripts/run_experimental_regression.sh` -`actor-credit` 和 `auto` 只服务一个很窄的热路径: +### Examples -- 前置变换必须全部是 partition-local -- 最终 barrier 必须按 `window_start + key` 分组 -- 聚合必须是 `sum(value)` +examples 与 helper scripts 只用于说明各层,不定义各层。 -不满足这组条件的 query 必须回退到 `single-process`,并通过 `execution_reason` 说明原因。 +- Bazel source group: + - `//:velaria_examples_sources` -## 流式 SQL +## Runtime Contract -Velaria 刻意把 stream SQL 保持在一个很小的子集内,并映射回现有 streaming operators。 +稳定的 runtime contract 文档位于 [docs/runtime-contract.md](./docs/runtime-contract.md)。 -### 入口 +主要 stream 入口: -- `session.streamSql("SELECT ...") -> StreamingDataFrame` -- `session.startStreamSql("INSERT INTO sink_table SELECT ...", options) -> StreamingQuery` -- `session.explainStreamSql(...) 
-> string` +- `session.readStream(source)` +- `session.readStreamCsvDir(path)` +- `session.streamSql(sql)` +- `session.explainStreamSql(sql, options)` +- `session.startStreamSql(sql, options)` +- `StreamingDataFrame.writeStream(sink, options)` -### 当前支持 +稳定 stream contract surface: -- 单表 `SELECT` -- `WHERE` -- `GROUP BY` -- `HAVING` -- `LIMIT` -- `SUM(col)` -- `COUNT(*)` -- 最小 window SQL:`WINDOW BY EVERY AS ` +- `StreamingQueryProgress` +- `snapshotJson()` +- `explainStreamSql(...)` +- `execution_mode / execution_reason / transport_mode` +- `checkpoint_delivery_mode` +- source/sink lifecycle:`open -> nextBatch -> checkpoint -> ack -> close` -支持的 DDL/DML: +`explainStreamSql(...)` 固定返回: -- `CREATE SOURCE TABLE ... USING csv` -- `CREATE SINK TABLE ... USING csv` -- `INSERT INTO sink_table SELECT ...` +- `logical` +- `physical` +- `strategy` -当前 stream SQL 不支持: +其中 `strategy` 是 mode 选择、fallback reason、transport、backpressure threshold 与 checkpoint delivery mode 的唯一解释出口。 -- `JOIN` -- `AVG / MIN / MAX` -- `INSERT INTO ... 
VALUES` -- 宽泛 ANSI window SQL -- CTE / 子查询 / `UNION` +## 当前能力边界 -示例: +当前已具备: -```sql -CREATE SOURCE TABLE stream_events (ts STRING, key STRING, value INT) -USING csv OPTIONS(path '/tmp/stream-input', delimiter ','); +- 本地 batch + streaming 共用一个内核 +- `read_csv`, `readStream(...)`, `readStreamCsvDir(...)` +- query-local 反压、有界 backlog、progress snapshot、checkpoint path +- 执行模式:`single-process`、`local-workers`、`actor-credit`、`auto` +- 文件 source/sink +- 基础 streaming operators:`select / filter / withColumn / drop / limit / window` +- stateful `sum` 和 `count` +- 最小 stream SQL 子集 +- 固定维度 float vector 的本地检索 +- Python Arrow 输入/输出 +- 同机 actor/rpc/jobmaster smoke 链路 -CREATE SINK TABLE stream_summary (window_start STRING, key STRING, value_sum INT) -USING csv OPTIONS(path '/tmp/stream-output.csv', delimiter ','); +当前明确不做: -INSERT INTO stream_summary -SELECT window_start, key, SUM(value) AS value_sum -FROM stream_events -WINDOW BY ts EVERY 60000 AS window_start -GROUP BY window_start, key; -``` +- 宣称已完成 distributed runtime +- 把 Python callback 拉进热路径 +- Python UDF +- 把 actor 并行化扩成任意 plan 的通用机制 +- 宽泛 SQL 扩展,例如完整 `JOIN / CTE / subquery / UNION` +- ANN / 独立 vector DB / 分布式 vector 执行 -## Python API +## Python Ecosystem -Python 继续只做前端和交换层,不进入执行热路径。 +Python 继续是正式支持的 ingress 与打包层,但不成为执行内核。 -主要 API: +当前支持的 Python surface: - `Session.read_csv(...)` - `Session.sql(...)` @@ -190,69 +199,66 @@ Python 继续只做前端和交换层,不进入执行热路径。 - `Session.stream_sql(...)` - `Session.explain_stream_sql(...)` - `Session.start_stream_sql(...)` +- `Session.vector_search(...)` +- `Session.explain_vector_search(...)` +- `read_excel(...)` +- custom source / custom sink adapter -Arrow ingestion 支持: +本仓库中的 Python 命令统一使用 `uv`: -- `pyarrow.Table` -- `pyarrow.RecordBatch` -- `RecordBatchReader` -- 实现了 `__arrow_c_stream__` 的对象 -- Arrow batch 的 Python 序列 - -### XLSX 读取 +```bash +bazel build //:velaria_pyext +uv sync --project python_api --python python3.12 +uv run --project python_api python 
python_api/examples/demo_batch_sql_arrow.py +uv run --project python_api python python_api/examples/demo_stream_sql.py +uv run --project python_api python python_api/examples/demo_vector_search.py +``` -仓库也支持直接读取 `.xlsx` 文件为 Velaria DataFrame。 +Python ecosystem 构建 / 测试前提: -使用方式为 `velaria.read_excel(session, path, ...)`: +- `uv` +- 一个带 `Python.h` 的本地 CPython +- 当 Bazel 不能自动发现可用解释器时,设置 `VELARIA_PYTHON_BIN` -```python -from velaria import Session, read_excel +推荐回归入口: -session = Session() -df = read_excel(session, "/path/to/file.xlsx", sheet_name="Sheet1") -session.create_temp_view("excel_source", df) -print(session.sql("SELECT * FROM excel_source LIMIT 5").to_rows()) +```bash +./scripts/run_python_ecosystem_regression.sh ``` -该能力依赖 `pandas` 与 `openpyxl`(已作为 Python 包依赖): +## Local Vector Search -```bash -uv run python -c "import pandas, openpyxl" -``` +vector search 是本地内核能力,不是新子系统。 -本仓库里的 Python 命令统一使用 `uv`: +`v0.1` 范围: -```bash -bazel build //:velaria_pyext -uv sync --project python_api --python python3.12 -uv run --project python_api python python_api/demo_batch_sql_arrow.py -uv run --project python_api python python_api/demo_stream_sql.py -``` +- fixed-dimension `float32` +- 指标:`cosine`、`dot`、`l2` +- `top-k` +- exact scan only +- `DataFrame` / `DataflowSession` +- Python `Session.vector_search(...)` +- Arrow `FixedSizeList` +- explain 输出 -同时在 Session 侧新增了向量查询入口:`Session.vectorQuery(table, vector_column, query_vector, top_k, metric)`(metric 支持 cosine/dot/l2),以及 explain 接口 `Session.explainVectorQuery(...)`。 +推荐的本地 CSV vector 文本格式: -支持打包单文件 CLI 可执行产物(内含 Python 运行时依赖 + native `_velaria.so`): +- `[1 2 3]` +- `[1,2,3]` -```bash -./scripts/build_py_cli_executable.sh -./dist/velaria-cli csv-sql \ - --csv /path/to/input.csv \ - --query "SELECT * FROM input_table LIMIT 5" -``` +设计文档: + +- [docs/local_vector_search_v01.md](./docs/local_vector_search_v01.md) -额外支持直接编译 native CLI 二进制(运行时不依赖 Python 环境): +CLI 示例: ```bash bazel build //:velaria_cli ./bazel-bin/velaria_cli \ --csv 
/path/to/input.csv \ --query "SELECT * FROM input_table LIMIT 5" -``` - -native CLI 向量查询(fixed length vector,支持 cosine/cosin、dot 与 l2): -```bash -./bazel-bin/velaria_cli \ +./dist/velaria-cli vector-search \ --csv /path/to/vectors.csv \ --vector-column embedding \ --query-vector "0.1,0.2,0.3" \ @@ -260,76 +266,65 @@ native CLI 向量查询(fixed length vector,支持 cosine/cosin、dot 与 l2 --top-k 5 ``` -runtime 传输层现已在 proto-like 与 binary row batch codec 中保留 `FixedVector` 类型,跨进程传输时不会丢失向量维度语义。 -FixedVector 在内部 codec 里改为 raw float bit payload 编码,避免文本往返造成的精度损耗。 -当前向量检索范围为本地 exact scan(`mode=exact-scan`)+ 固定维度 float 向量;v0.1 不包含 ANN 与分布式执行路径。 -Arrow ingestion 已增加 `FixedSizeList` 的 native 快路径,可减少向量列的 Python 对象转换开销。 -同机 actor runtime 的结果回传现在采用“双帧”模型:控制消息继续走 `actor-rpc-v1`,结果表单独走 `table-bin-v1` 的 `DataBatch` 帧,并通过 `correlation_id` 关联;热路径不再把整张结果表塞进 actor JSON body。 +vector explain 是稳定 contract 的一部分,当前要求至少包含: -## 同机多进程实验路径 +- `mode=exact-scan` +- `metric=` +- `dimension=` +- `top_k=` +- `candidate_rows=` +- `filter_pushdown=false` +- `acceleration=flat-buffer+heap-topk` -同机路径刻意保持最小: - -- scheduler 接收提交并维护 snapshot -- worker 执行本地 operator chain -- dashboard 和 client 都必须走 worker 执行链 - -构建: +benchmark 基线入口: ```bash -bazel build //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke +./scripts/run_vector_search_benchmark.sh ``` -smoke: +该脚本默认跑轻量 `--quick` 基线;如需完整 sweep,直接执行 `bazel run //:vector_search_benchmark`。 -```bash -bazel run //:actor_rpc_smoke -``` +## Experimental Runtime -该 smoke 现会同时校验 actor 控制消息和关联的二进制 `DataBatch` 结果帧。 +同机路径继续刻意保持收敛: -三进程本地运行: - -```bash -bazel run //:actor_rpc_scheduler -- --listen 127.0.0.1:61000 --node-id scheduler --dashboard-enabled --dashboard-listen 127.0.0.1:8080 -bazel run //:actor_rpc_worker -- --connect 127.0.0.1:61000 --node-id worker-1 -bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo payload" +```text +client -> scheduler(jobmaster) -> worker -> in-proc operator chain -> result ``` -Dashboard: 
+它存在的目的: -- 地址:`http://127.0.0.1:8080` -- 源码:`src/dataflow/runner/dashboard/app.ts` -- 构建目标:`//:dashboard_app_js` +- 同机执行实验 +- transport 与 codec 观测 +- benchmark 与 observability 开发 -## Benchmark 与 Observability +它不代表: -常用本地目标: +- 已完成 distributed scheduling +- 已完成 distributed fault recovery +- 已完成 cluster resource governance +- 已支持 production 级 distributed vector execution -- `//:stream_benchmark` -- `//:stream_actor_benchmark` -- `//:tpch_q1_style_benchmark` -- `//:vector_search_benchmark` - -向量 benchmark: +构建: ```bash -bazel run //:vector_search_benchmark +bazel build //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke ``` -会输出两类 JSON 行: +smoke: -- `vector-query`:cold query、warm query、warm explain 延迟 -- `vector-transport`:proto-like 与 `BinaryRowBatch` 的编解码耗时、payload 大小,以及 actor 控制帧开销 +```bash +bazel run //:actor_rpc_smoke +``` -同机 observability regression: +三进程本地运行: ```bash -./scripts/run_stream_observability_regression.sh +bazel run //:actor_rpc_scheduler -- --listen 127.0.0.1:61000 --node-id scheduler +bazel run //:actor_rpc_worker -- --connect 127.0.0.1:61000 --node-id worker-1 +bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo payload" ``` -这些 benchmark 会输出结构化 profile,用于同机执行路径诊断和回归跟踪,不用于把绝对吞吐做成机器敏感的硬门槛。 - ## 构建与验证 单机基线: @@ -340,20 +335,26 @@ bazel run //:df_demo bazel run //:stream_demo ``` -核心回归集: +分层回归入口: + +```bash +./scripts/run_core_regression.sh +./scripts/run_python_ecosystem_regression.sh +./scripts/run_experimental_regression.sh +``` + +直接使用 Bazel suite: ```bash -bazel test //:sql_regression_test //:planner_v03_test //:stream_runtime_test //:stream_actor_credit_test //:source_sink_abi_test //:stream_strategy_explain_test -bazel test //python_api:custom_stream_source_test //python_api:streaming_v05_test //python_api:arrow_stream_ingestion_test +bazel test //:core_regression +bazel test //:python_ecosystem_regression +bazel test //:experimental_regression ``` -一行 build/smoke 摘要: +同机 observability 
regression: ```bash -bazel build //:sql_demo //:df_demo //:stream_demo \ - //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke \ - && bazel run //:actor_rpc_smoke \ - && echo '[summary] build+smoke ok' +./scripts/run_stream_observability_regression.sh ``` ## 仓库规则 @@ -361,16 +362,7 @@ bazel build //:sql_demo //:df_demo //:stream_demo \ - 语言基线:`C++17` - 构建系统:`Bazel` - 对外 session 入口保持为 `DataflowSession` -- 扩展同机实验时不要破坏 `sql_demo / df_demo / stream_demo` +- 不要破坏 `sql_demo / df_demo / stream_demo` - 示例源码统一使用 `.cc` - 本仓库中的 Python 命令统一使用 `uv` - -## CI 与打包 - -CI 维持收敛: - -- PR CI 覆盖 native build、回归测试和 Python smoke -- wheel job 生成 Linux 与 macOS native wheel -- release 通过 tag 驱动,并校验 `velaria.__version__` - -目标是让日常开发成本保持可控,同时验证当前真正暴露出去的接口面。 +- `README.md` 与 `README-zh.md` 必须保持同步 diff --git a/README.md b/README.md index 8c93c6f..7b33a06 100644 --- a/README.md +++ b/README.md @@ -1,185 +1,194 @@ -# Velaria: A Pure C++17 Streaming-First Dataflow Kernel +# Velaria: A Pure C++17 Local Dataflow Kernel `README.md` is the English source of truth. The Chinese mirror lives in [README-zh.md](./README-zh.md). Keep both files aligned. -Velaria is a local-first C++17 dataflow engine research project. The current goal is narrow on purpose: stabilize the single-machine streaming path, keep batch and stream inside one execution model, and extend carefully toward same-host multi-process execution without claiming a finished distributed runtime. +Velaria is a local-first C++17 dataflow engine research project. 
The repository is now organized around one kernel plus two explicit non-kernel layers: -## What It Is +- `Core Kernel` + - local execution semantics + - batch + stream in one model + - stable explain / progress / checkpoint contract +- `Python Ecosystem` + - supported Arrow / wheel / CLI / `uv` / Excel / Bitable / custom stream adapters + - projects the kernel outward without becoming the hot path +- `Experimental Runtime` + - same-host `actor/rpc/jobmaster` + - execution and observability research lane, not a second kernel -Today the main path is: +## Golden Path -`micro-batch in-proc + query-local backpressure + local worker scale-up` - -The repository also keeps `actor + rpc + jobmaster` as a same-host multi-process experiment. That path is for observability and execution research, not for claiming distributed scheduling, fault recovery, state migration, or resource governance. - -Core public objects: - -- `DataflowSession` -- `DataFrame` -- `StreamingDataFrame` -- `StreamingQuery` - -Core local flow: +The only golden path is: ```text -source -> StreamingDataFrame/operator chain -> sink +Arrow / CSV / Python ingress + -> DataflowSession / DataFrame / StreamingDataFrame + -> local runtime kernel + -> sink + -> explain / progress / checkpoint ``` -Same-host experimental flow: - -```text -client -> scheduler(jobmaster) -> worker -> in-proc operator chain -> result -``` - -## Current Capability Boundary - -Available today: - -- batch + streaming execution through one local runtime -- `read_csv`, `readStream(...)`, `readStreamCsvDir(...)` -- query-local backpressure, bounded backlog, progress snapshots, checkpoint path -- execution modes: `single-process`, `local-workers`, `actor-credit`, `auto` -- local file source/sink support -- basic stream operators: `select / filter / withColumn / drop / limit / window` -- stateful `sum` and `count` -- stream SQL subset on top of existing streaming operators -- Python Arrow ingestion and Arrow output -- same-host 
actor/rpc/jobmaster smoke path +Public session entry: -Out of scope in the current repo state: +- `DataflowSession` -- completed distributed runtime claims -- Python callback execution in the hot path -- Python UDFs -- Python sink callbacks into the native sink ABI -- generic plan parallelization from `actor-credit` -- broad SQL expansion such as full `JOIN / CTE / subquery / UNION / streaming AVG/MIN/MAX` +Core user-facing objects: -## Streaming Runtime Contracts +- `DataFrame` +- `StreamingDataFrame` +- `StreamingQuery` -Main streaming entry points: +The same-host actor/rpc path stays in the repo, but it is not the main product story. -- `session.readStream(source)` -- `session.readStreamCsvDir(path)` -- `session.streamSql(sql)` -- `session.explainStreamSql(sql, options)` -- `session.startStreamSql(sql, options)` -- `StreamingDataFrame.writeStream(sink, options)` +## Repository Layers -### Progress and Strategy +### Core Kernel -`StreamingQueryProgress` and `snapshotJson()` expose: +Core owns: -- execution choice: `execution_mode`, `execution_reason`, `transport_mode` -- workload estimates: `estimated_state_size_bytes`, `estimated_batch_cost` -- source/sink state: `source_is_bounded`, `sink_is_blocking` -- flow-control counters: backlog, inflight, blocked, watermark fields -- checkpoint fields: `checkpoint_delivery_mode`, `last_source_offset` +- logical planning and minimal SQL mapping +- table/value execution model +- local batch and streaming runtime +- source/sink ABI +- runtime contract surfaces +- local vector search capability -`explainStreamSql(...)` returns three sections: +Repository entrypoints: -- `logical` -- `physical` -- `strategy` +- docs: + - [docs/core-boundary.md](./docs/core-boundary.md) + - [docs/runtime-contract.md](./docs/runtime-contract.md) + - [docs/streaming_runtime_design.md](./docs/streaming_runtime_design.md) +- Bazel source groups: + - `//:velaria_core_logical_sources` + - `//:velaria_core_execution_sources` + - 
`//:velaria_core_contract_sources` +- regression suite: + - `//:core_regression` -The strategy section explains selected mode, fallback reason, actor hot-path eligibility, transport, batch/state estimates, and the actor/shared-memory knobs used for the decision. +### Python Ecosystem -### Backpressure +Python is a supported ecosystem layer, not a convenience-only wrapper. -Velaria currently defines backpressure as query-local and bounded: +It includes: -- `backlog` is the queued batch count after pull and before drain -- `blocked_count` counts producer wait events, not loop iterations -- `max_backlog_batches` is the largest observed backlog immediately after enqueue -- `inflight_batches` and `inflight_partitions` describe queued work not yet consumed -- slow sink, slow state finalize, or local partition pressure feed back into the same query-local backlog counters +- native binding in `python_api` +- Arrow ingestion and output +- `uv` workflow +- wheel / native wheel / CLI packaging +- Excel and Bitable adapters +- custom source / custom sink adapters +- supported CLI tooling in `python_api/velaria_cli.py` +- Python ecosystem demos in `python_api/examples` +- Python benchmarks in `python_api/benchmarks` -Latency fields: +It does not define: -- `last_batch_latency_ms`: batch start to sink flush completion -- `last_sink_latency_ms`: sink write + flush -- `last_state_latency_ms`: state/window finalize time, `0` for stateless batches +- execution hot-path behavior +- independent progress/checkpoint semantics +- independent vector-search semantics -### Checkpoint and Resume +Repository entrypoints: -Checkpoint files are local and persisted atomically. 
+- docs: + - [python_api/README.md](./python_api/README.md) +- Bazel source group: + - `//:velaria_python_ecosystem_sources` +- Python-layer source groups: + - `//python_api:velaria_python_supported_sources` + - `//python_api:velaria_python_example_sources` + - `//python_api:velaria_python_experimental_sources` +- regression suite: + - `//:python_ecosystem_regression` +- Python-layer regression suite: + - `//python_api:velaria_python_supported_regression` +- shell entrypoint: + - `./scripts/run_python_ecosystem_regression.sh` -Current delivery semantics: +### Experimental Runtime -- default `at-least-once`: source offset is not restored; replay and duplicate sink output are allowed -- `best-effort`: restore source offset only when the source implements `restoreOffsetToken(...)`; still not exactly-once sink delivery +Experimental runtime includes: -Current built-in source behavior: +- actor runtime +- rpc codec / transport experiments +- scheduler / worker / client flow +- same-host smoke and benchmark tools -- `MemoryStreamSource`: can resume by batch offset under `best-effort` -- `DirectoryCsvStreamSource`: can resume by last completed file under `best-effort` +Repository entrypoints: -### Actor-Credit and Auto +- Bazel source group: + - `//:velaria_experimental_sources` +- regression suite: + - `//:experimental_regression` +- shell entrypoint: + - `./scripts/run_experimental_regression.sh` -`actor-credit` and `auto` only target one narrow hot path: +### Examples -- all upstream transforms must be partition-local -- the final barrier must be grouped by `window_start + key` -- the aggregate must be `sum(value)` +Examples and helper scripts illustrate layers; they do not define them. -Queries outside that shape must fall back to `single-process`, and the reason is recorded in `execution_reason`. 
+- Bazel source group: + - `//:velaria_examples_sources` -## Streaming SQL +## Runtime Contract -Velaria keeps stream SQL intentionally small and maps it back to existing streaming operators. +The stable runtime-facing contract is documented in [docs/runtime-contract.md](./docs/runtime-contract.md). -### Entry Points +Main stream entry points: -- `session.streamSql("SELECT ...") -> StreamingDataFrame` -- `session.startStreamSql("INSERT INTO sink_table SELECT ...", options) -> StreamingQuery` -- `session.explainStreamSql(...) -> string` +- `session.readStream(source)` +- `session.readStreamCsvDir(path)` +- `session.streamSql(sql)` +- `session.explainStreamSql(sql, options)` +- `session.startStreamSql(sql, options)` +- `StreamingDataFrame.writeStream(sink, options)` -### Supported Subset +Stable stream contract surfaces: -- single-table `SELECT` -- `WHERE` -- `GROUP BY` -- `HAVING` -- `LIMIT` -- `SUM(col)` -- `COUNT(*)` -- minimal window SQL: `WINDOW BY EVERY AS ` +- `StreamingQueryProgress` +- `snapshotJson()` +- `explainStreamSql(...)` +- `execution_mode / execution_reason / transport_mode` +- `checkpoint_delivery_mode` +- source/sink lifecycle: `open -> nextBatch -> checkpoint -> ack -> close` -Supported DDL/DML: +`explainStreamSql(...)` always returns: -- `CREATE SOURCE TABLE ... USING csv` -- `CREATE SINK TABLE ... USING csv` -- `INSERT INTO sink_table SELECT ...` +- `logical` +- `physical` +- `strategy` -Not supported in stream SQL: +`strategy` is the single explanation outlet for mode selection, fallback reason, transport, backpressure thresholds, and checkpoint delivery mode. -- `JOIN` -- `AVG / MIN / MAX` -- `INSERT INTO ... 
VALUES` -- broad ANSI window SQL -- CTE / subquery / `UNION` +## Current Capability Boundary -Example: +Available today: -```sql -CREATE SOURCE TABLE stream_events (ts STRING, key STRING, value INT) -USING csv OPTIONS(path '/tmp/stream-input', delimiter ','); +- local batch + streaming execution through one kernel +- `read_csv`, `readStream(...)`, `readStreamCsvDir(...)` +- query-local backpressure, bounded backlog, progress snapshots, checkpoint path +- execution modes: `single-process`, `local-workers`, `actor-credit`, `auto` +- file source/sink support +- basic stream operators: `select / filter / withColumn / drop / limit / window` +- stateful `sum` and `count` +- minimal stream SQL subset +- local vector search on fixed-dimension float vectors +- Python Arrow ingestion and output +- same-host actor/rpc/jobmaster smoke path -CREATE SINK TABLE stream_summary (window_start STRING, key STRING, value_sum INT) -USING csv OPTIONS(path '/tmp/stream-output.csv', delimiter ','); +Out of scope in the current repo state: -INSERT INTO stream_summary -SELECT window_start, key, SUM(value) AS value_sum -FROM stream_events -WINDOW BY ts EVERY 60000 AS window_start -GROUP BY window_start, key; -``` +- completed distributed runtime claims +- Python callback execution in the hot path +- Python UDFs +- generic actor parallelization for arbitrary plans +- broad SQL expansion such as full `JOIN / CTE / subquery / UNION` +- ANN / standalone vector DB / distributed vector execution -## Python API +## Python Ecosystem -Python remains an exchange/front-end layer. It does not become the execution hot path. +Python remains a supported ingress and packaging layer. It does not become the execution core. 
-Main API: +Main supported Python surfaces: - `Session.read_csv(...)` - `Session.sql(...)` @@ -190,48 +199,66 @@ Main API: - `Session.stream_sql(...)` - `Session.explain_stream_sql(...)` - `Session.start_stream_sql(...)` -- `Session.vectorQuery(table, vector_column, query_vector, top_k, metric)` (`metric`: cosine/dot/l2) -- `Session.explainVectorQuery(table, vector_column, query_vector, top_k, metric)` - -Arrow ingestion accepts: +- `Session.vector_search(...)` +- `Session.explain_vector_search(...)` +- `read_excel(...)` +- custom source / custom sink adapters -- `pyarrow.Table` -- `pyarrow.RecordBatch` -- `RecordBatchReader` -- objects implementing `__arrow_c_stream__` -- Python sequences of Arrow batches - -Python commands in this repo should use `uv`: +Python ecosystem commands in this repo use `uv`: ```bash bazel build //:velaria_pyext uv sync --project python_api --python python3.12 -uv run --project python_api python python_api/demo_batch_sql_arrow.py -uv run --project python_api python python_api/demo_stream_sql.py +uv run --project python_api python python_api/examples/demo_batch_sql_arrow.py +uv run --project python_api python python_api/examples/demo_stream_sql.py +uv run --project python_api python python_api/examples/demo_vector_search.py ``` -Build a single-file CLI executable (bundles Python runtime deps + native `_velaria.so`): +Python ecosystem build/test prerequisites: + +- `uv` +- a local CPython interpreter with `Python.h` +- `VELARIA_PYTHON_BIN` when Bazel cannot auto-discover a usable interpreter + +Recommended regression entrypoint: ```bash -./scripts/build_py_cli_executable.sh -./dist/velaria-cli csv-sql \ - --csv /path/to/input.csv \ - --query "SELECT * FROM input_table LIMIT 5" +./scripts/run_python_ecosystem_regression.sh ``` -Build a native CLI binary (no Python runtime dependency required at runtime): +## Local Vector Search + +Vector search is a local kernel capability, not a new subsystem. 
+ +Scope in `v0.1`: + +- fixed-dimension `float32` +- metrics: `cosine`, `dot`, `l2` +- `top-k` +- exact scan only +- `DataFrame` / `DataflowSession` +- Python `Session.vector_search(...)` +- Arrow `FixedSizeList` +- explain output + +Preferred local CSV vector text shape: + +- `[1 2 3]` +- `[1,2,3]` + +Design doc: + +- [docs/local_vector_search_v01.md](./docs/local_vector_search_v01.md) + +CLI examples: ```bash bazel build //:velaria_cli ./bazel-bin/velaria_cli \ --csv /path/to/input.csv \ --query "SELECT * FROM input_table LIMIT 5" -``` - -Vector query (fixed-length vector, cosine/dot/l2) via native CLI: -```bash -./bazel-bin/velaria_cli \ +./dist/velaria-cli vector-search \ --csv /path/to/vectors.csv \ --vector-column embedding \ --query-vector "0.1,0.2,0.3" \ @@ -239,76 +266,65 @@ Vector query (fixed-length vector, cosine/dot/l2) via native CLI: --top-k 5 ``` -Runtime-level vector transport now preserves `FixedVector` through proto-like and binary row batch codecs, so cross-process payloads keep vector type and dimensions. -FixedVector serialization now uses raw float bit payload encoding in internal codecs to avoid text round-trip precision loss. -Current vector search scope is local-only exact scan (`mode=exact-scan`) with fixed-dimension float vectors; no ANN/distributed path in v0.1. -Arrow ingestion now includes a direct `FixedSizeList` fast path in the native bridge, reducing Python object conversion overhead on vector columns. -For same-host actor runtime results, the control message stays on `actor-rpc-v1`, while the result table is forwarded as a separate `table-bin-v1` `DataBatch` frame linked by `correlation_id`. The hot result path no longer puts row payloads inside the actor JSON body. +Vector explain is part of the stable contract. 
Current required fields include:

-## Same-Host Multi-Process Experiment
+- `mode=exact-scan`
+- `metric=<metric>`
+- `dimension=<dimension>`
+- `top_k=<top_k>`
+- `candidate_rows=<candidate_rows>`
+- `filter_pushdown=false`
+- `acceleration=flat-buffer+heap-topk`

-The same-host path is intentionally minimal:
-
-- scheduler accepts submission and keeps snapshots
-- worker runs the local operator chain
-- dashboard and client both submit through the worker execution path
-
-Build:
+Benchmark baseline:

```bash
-bazel build //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke
+./scripts/run_vector_search_benchmark.sh
```

-Smoke:
+The script runs a quick exact-scan baseline. Use `bazel run //:vector_search_benchmark` for the full sweep.

-```bash
-bazel run //:actor_rpc_smoke
-```
+## Experimental Runtime

-The smoke target now verifies both the actor control message and the correlated binary `DataBatch` result frame.
+The same-host path stays intentionally narrow:

-Three-process local run:
-
-```bash
-bazel run //:actor_rpc_scheduler -- --listen 127.0.0.1:61000 --node-id scheduler --dashboard-enabled --dashboard-listen 127.0.0.1:8080
-bazel run //:actor_rpc_worker -- --connect 127.0.0.1:61000 --node-id worker-1
-bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo payload"
+```text
+client -> scheduler(jobmaster) -> worker -> in-proc operator chain -> result
```

-Dashboard:
+It exists for:

-- URL: `http://127.0.0.1:8080`
-- source: `src/dataflow/runner/dashboard/app.ts`
-- build target: `//:dashboard_app_js`
+- same-host execution experiments
+- transport and codec observation
+- benchmark and observability development

-## Benchmarks and Observability
+It does not imply:

-Useful local targets:
+- distributed scheduling
+- distributed fault recovery
+- cluster resource governance
+- production distributed vector execution

-- `//:stream_benchmark`
-- `//:stream_actor_benchmark`
-- `//:tpch_q1_style_benchmark`
-- `//:vector_search_benchmark`
-
-Vector benchmark:
+Build: ```bash -bazel run //:vector_search_benchmark +bazel build //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke ``` -It emits JSON lines for: +Smoke: -- `vector-query`: cold query, warm query, and warm explain latency -- `vector-transport`: proto-like vs `BinaryRowBatch` serialize/deserialize cost and payload size, plus actor control-frame overhead +```bash +bazel run //:actor_rpc_smoke +``` -Same-host observability regression: +Three-process local run: ```bash -./scripts/run_stream_observability_regression.sh +bazel run //:actor_rpc_scheduler -- --listen 127.0.0.1:61000 --node-id scheduler +bazel run //:actor_rpc_worker -- --connect 127.0.0.1:61000 --node-id worker-1 +bazel run //:actor_rpc_client -- --connect 127.0.0.1:61000 --payload "demo payload" ``` -The benchmark path exposes structured profile output for same-host execution diagnosis. This is for regression tracking and explainability, not for machine-stable absolute throughput gates. - ## Build and Verification Single-node baseline: @@ -319,20 +335,26 @@ bazel run //:df_demo bazel run //:stream_demo ``` -Core regression set: +Layered regression entrypoints: + +```bash +./scripts/run_core_regression.sh +./scripts/run_python_ecosystem_regression.sh +./scripts/run_experimental_regression.sh +``` + +Direct Bazel suites: ```bash -bazel test //:sql_regression_test //:planner_v03_test //:stream_runtime_test //:stream_actor_credit_test //:source_sink_abi_test //:stream_strategy_explain_test -bazel test //python_api:custom_stream_source_test //python_api:streaming_v05_test //python_api:arrow_stream_ingestion_test +bazel test //:core_regression +bazel test //:python_ecosystem_regression +bazel test //:experimental_regression ``` -One-line build/smoke summary: +Same-host observability regression: ```bash -bazel build //:sql_demo //:df_demo //:stream_demo \ - //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke \ - && bazel run //:actor_rpc_smoke \ - && 
echo '[summary] build+smoke ok' +./scripts/run_stream_observability_regression.sh ``` ## Repository Rules @@ -340,16 +362,7 @@ bazel build //:sql_demo //:df_demo //:stream_demo \ - language baseline: `C++17` - build system: `Bazel` - keep `DataflowSession` as the public session entry -- do not break `sql_demo / df_demo / stream_demo` while extending same-host experiments +- do not break `sql_demo / df_demo / stream_demo` - keep example source files as `.cc` - use `uv` for Python commands in this repository - -## CI and Packaging - -CI stays intentionally narrow: - -- PR CI covers native build, regression tests, and Python smoke -- wheel jobs build Linux and macOS native wheels -- releases are tag-driven and validated against `velaria.__version__` - -The goal is to keep routine development cheap while still validating the public surfaces that currently matter. +- keep `README.md` and `README-zh.md` aligned diff --git a/docs/core-boundary.md b/docs/core-boundary.md new file mode 100644 index 0000000..5c82a38 --- /dev/null +++ b/docs/core-boundary.md @@ -0,0 +1,154 @@ +# Velaria Core Boundary + +## Summary + +Velaria is organized around one kernel and two non-kernel layers: + +- `Core Kernel` + - pure `C++17` + - local-first execution + - batch and stream share one execution model + - runtime behavior is exposed through stable contract surfaces +- `Python Ecosystem` + - supported ingress, interop, packaging, and automation layer + - Arrow, wheel/CLI, `uv`, Excel/Bitable, custom stream adapters + - projects core behavior outward, but does not define execution semantics +- `Experimental Runtime` + - same-host `actor/rpc/jobmaster` + - observability and execution research lane + - not a second production kernel + +The only golden path is: + +```text +Arrow / CSV / Python ingress + -> DataflowSession / DataFrame / StreamingDataFrame + -> local runtime kernel + -> sink + -> explain / progress / checkpoint +``` + +## Layering + +### Core Kernel + +Core owns the semantics 
that must stay stable across C++, Python, demos, and future integrations: + +- logical planning and minimal SQL mapping +- table/value model +- local execution and streaming runtime +- vector search as a local index capability +- `DataflowSession`, `DataFrame`, `StreamingDataFrame` +- source/sink ABI +- progress / checkpoint / explain contract + +Repository view: + +- Bazel source groups: + - `//:velaria_core_logical_sources` + - `//:velaria_core_execution_sources` + - `//:velaria_core_contract_sources` +- regression entrypoint: + - `//:core_regression` + +### Python Ecosystem + +Python is a supported ecosystem layer, not a convenience sidecar. + +Python ecosystem owns: + +- native binding surface in `python_api` +- supported library modules in `python_api/velaria` +- supported CLI tooling in `python_api/velaria_cli.py` +- Arrow ingestion/output +- `uv`-based development and test workflow +- wheel, native wheel, and CLI packaging +- Excel and Bitable adapters +- custom source / custom sink adapters +- Python-facing demos in `python_api/examples` +- Python-facing benchmarks in `python_api/benchmarks` + +Python ecosystem must not: + +- redefine runtime contract semantics +- introduce a Python hot path for core execution +- become the source of truth for progress/checkpoint/explain behavior +- require experimental runtime components for normal operation + +Repository view: + +- Bazel source group: + - `//:velaria_python_ecosystem_sources` +- Python-layer source groups: + - `//python_api:velaria_python_supported_sources` + - `//python_api:velaria_python_example_sources` + - `//python_api:velaria_python_experimental_sources` +- regression entrypoint: + - `//:python_ecosystem_regression` +- Python-layer regression entrypoint: + - `//python_api:velaria_python_supported_regression` +- shell entrypoint: + - `./scripts/run_python_ecosystem_regression.sh` + +### Experimental Runtime + +Experimental runtime remains in the repo because it is useful for same-host execution and 
observability research. + +It includes: + +- actor runtime +- rpc codec and same-host transport experiments +- scheduler / worker / client flow +- same-host benchmark and smoke scripts + +It does not redefine: + +- the public session entry +- batch/stream semantics +- checkpoint delivery contract +- vector query semantics + +Repository view: + +- Bazel source group: + - `//:velaria_experimental_sources` +- regression entrypoint: + - `//:experimental_regression` +- shell entrypoint: + - `./scripts/run_experimental_regression.sh` + +### Examples + +Examples demonstrate layers; they do not define them. + +Examples include: + +- single-node demos +- vector benchmarks +- same-host smoke tools +- local helper scripts and skills + +Repository view: + +- Bazel source group: + - `//:velaria_examples_sources` + +## Ownership Rules + +- `DataflowSession` remains the only public session entry. +- SQL stays an ingress surface. It does not back-drive runtime design. +- Python remains supported, but cannot become the execution core. +- Vector search remains a core local capability, not a new subsystem. +- Same-host actor/rpc stays experimental, even when it is featureful. +- `sql_demo / df_demo / stream_demo` are the single-node baseline and must remain intact. + +## Full-Reorg Note + +This reorg is implemented first through repository-facing structure: + +- layered Bazel source groups +- layered regression suites +- layered documentation +- README and Python ecosystem reordering + +Source paths still live under the existing `src/dataflow` and `python_api` roots so the current build graph and examples remain stable while the layer boundaries become explicit and enforceable. diff --git a/docs/local_vector_search_v01.md b/docs/local_vector_search_v01.md index b841fca..6d85094 100644 --- a/docs/local_vector_search_v01.md +++ b/docs/local_vector_search_v01.md @@ -1,5 +1,7 @@ # Local Vector Search v0.1 (Velaria) +This document describes the minimal local vector search design. 
For the stable runtime-facing explain and ecosystem contract, see [runtime-contract.md](./runtime-contract.md). For repository positioning, see [core-boundary.md](./core-boundary.md). + ## Scope This document defines a minimal local-first vector search path for Velaria. @@ -45,6 +47,13 @@ This document defines a minimal local-first vector search path for Velaria. - `Session.vector_search(table, vector_column, query_vector, top_k=10, metric="cosine")` - `Session.explain_vector_search(table, vector_column, query_vector, top_k=10, metric="cosine")` +### Ingestion shapes + +- preferred Arrow shape: `FixedSizeList` +- supported Python Arrow entrypoints: `Table`, `RecordBatch`, `RecordBatchReader`, and `__arrow_c_stream__` +- supported local CSV text shape: bracketed vectors such as `[1 2 3]` or `[1,2,3]` +- current CSV parser is still minimal; whitespace-separated bracketed vectors are the safest local format + ## Explain fields Current explain output contains: @@ -65,3 +74,23 @@ Current explain output contains: - Dimension mismatch rejection. - Python API shape and argument validation. - Arrow `FixedSizeList` ingestion fast path coverage. +- CSV bracketed vector ingestion coverage. + +## Benchmark baseline + +Repository entrypoints: + +- C++ vector benchmark: + - `bazel run //:vector_search_benchmark` +- stable benchmark wrapper: + - `./scripts/run_vector_search_benchmark.sh` + +The script uses the benchmark binary's `--quick` preset so repository verification stays lightweight. Use the raw Bazel target for the full baseline sweep. 
+ +The benchmark baseline is still intentionally narrow: + +- local exact-scan only +- query metrics: `cosine`, `dot`, `l2` +- transport roundtrip coverage for proto-like, binary row batch, and actor-rpc control payloads +- no ANN comparisons +- no distributed claims diff --git a/docs/runtime-contract.md b/docs/runtime-contract.md new file mode 100644 index 0000000..560c26f --- /dev/null +++ b/docs/runtime-contract.md @@ -0,0 +1,254 @@ +# Velaria Runtime Contract + +## Summary + +This document defines the stable runtime-facing contract for Velaria's local kernel. + +It is the source of truth for: + +- progress fields +- checkpoint fields and delivery modes +- explain structure +- execution-mode reporting +- source/sink lifecycle semantics +- Python ecosystem mapping rules +- vector explain surface + +This document complements: + +- [core-boundary.md](./core-boundary.md) +- [streaming_runtime_design.md](./streaming_runtime_design.md) +- [local_vector_search_v01.md](./local_vector_search_v01.md) + +## Public Contract Surfaces + +Core C++ surfaces: + +- `DataflowSession` +- `DataFrame` +- `StreamingDataFrame` +- `StreamingQueryProgress` +- source/sink ABI in `stream/source_sink_abi.h` + +Python ecosystem projections: + +- `Session.stream_sql(...)` +- `Session.explain_stream_sql(...)` +- `Session.start_stream_sql(...)` +- `Session.vector_search(...)` +- `Session.explain_vector_search(...)` +- `Session.create_dataframe_from_arrow(...)` +- `Session.create_stream_from_arrow(...)` +- `read_excel(...)` + +Python-facing APIs may keep Pythonic naming, but their behavior must project the same semantics as the C++ kernel. 
+ +## StreamingQueryProgress + +The following fields are treated as stable contract output: + +- identity and status + - `query_id` + - `status` +- execution choice + - `requested_execution_mode` + - `execution_mode` + - `execution_reason` + - `transport_mode` +- throughput and queueing + - `batches_pulled` + - `batches_processed` + - `blocked_count` + - `max_backlog_batches` + - `inflight_batches` + - `inflight_partitions` +- latency + - `last_batch_latency_ms` + - `last_sink_latency_ms` + - `last_state_latency_ms` +- source and strategy shape + - `last_source_offset` + - `backpressure_active` + - `actor_eligible` + - `used_actor_runtime` + - `used_shared_memory` + - `has_stateful_ops` + - `has_window` + - `sink_is_blocking` + - `source_is_bounded` +- estimates and thresholds + - `estimated_partitions` + - `projected_payload_bytes` + - `sampled_batches` + - `sampled_rows_per_batch` + - `average_projected_payload_bytes` + - `actor_speedup` + - `compute_to_overhead_ratio` + - `estimated_state_size_bytes` + - `estimated_batch_cost` + - `backpressure_max_queue_batches` + - `backpressure_high_watermark` + - `backpressure_low_watermark` +- checkpoint + - `checkpoint_delivery_mode` + +Field names must remain stable in both `StreamingQueryProgress` and Python dictionary projections unless there is an intentional versioned migration. + +## snapshotJson() + +`snapshotJson()` is a serialized projection of the same progress contract, not a separate contract family. 
+
+Rules:
+
+- it must expose the same stable field names where available
+- it must not rename execution, checkpoint, or vector-related fields independently
+- Python and docs must treat it as a serialized view of `StreamingQueryProgress`
+
+## Explain Contract
+
+### Stream SQL Explain
+
+`explainStreamSql(...)` must return exactly three top-level sections:
+
+- `logical`
+- `physical`
+- `strategy`
+
+The `strategy` section must be the single explanation outlet for:
+
+- selected mode
+- fallback / downgrade reason
+- actor hot-path eligibility
+- transport mode
+- state and batch estimates
+- shared-memory knobs
+- checkpoint delivery mode
+- backpressure threshold snapshot
+
+If runtime behavior changes, `strategy` text and `StreamingQueryProgress` must change together.
+
+### Vector Explain
+
+Vector explain output is a stable local capability contract.
+
+Current required fields:
+
+- `mode=exact-scan`
+- `metric=<metric>`
+- `dimension=<dimension>`
+- `top_k=<top_k>`
+- `candidate_rows=<candidate_rows>`
+- `filter_pushdown=false`
+- `acceleration=flat-buffer+heap-topk`
+
+Python `Session.explain_vector_search(...)` and CLI output must project this same core explain behavior.
+
+Preferred vector ingestion contract:
+
+- Arrow uses `FixedSizeList` as the first-class shape
+- local CSV uses bracketed vector text such as `[1 2 3]` or `[1,2,3]`
+- supported v0.1 execution remains local exact scan only
+
+## Execution Modes
+
+Stable execution-mode contract fields:
+
+- `requested_execution_mode`
+- `execution_mode`
+- `execution_reason`
+- `transport_mode`
+
+Semantics:
+
+- requested mode records what the caller asked for
+- execution mode records what the kernel actually used
+- execution reason records fallback, downgrade, or final selection reason
+- transport mode records the data-plane transport used by the kernel
+
+The planner/explain path and runtime path must not describe different decisions.
+ +## Checkpoint Delivery + +Stable checkpoint contract: + +- `at-least-once` + - default mode + - allows replay and duplicate sink output + - source offset is not restored by default +- `best-effort` + - restores source offset only when the source implements restore support + - still does not claim exactly-once sink semantics + +Stable checkpoint-related outputs: + +- `checkpoint_delivery_mode` +- `last_source_offset` + +Checkpoint files remain local and atomically replaced. + +## Source/Sink Lifecycle + +The stable lifecycle is: + +```text +open -> nextBatch -> checkpoint -> ack -> close +``` + +Rules: + +- source and sink open with query-scoped context +- checkpoint markers carry source offset and batch progress +- source receives `ack(token)` after checkpointed progress is recorded +- lifecycle semantics are query-local and do not claim distributed coordination + +## Python Ecosystem Mapping Rules + +Python is allowed to: + +- wrap core APIs +- package and distribute bindings +- offer ecosystem-friendly names +- provide Arrow/Excel/Bitable/custom source entrypoints +- compose demos and helper scripts + +Python is not allowed to: + +- invent a separate progress schema +- invent a separate checkpoint contract +- implement a separate vector-scoring semantic for supported CLI/API paths +- treat experimental runtime behavior as a required dependency + +## Build and Toolchain Contract + +Python ecosystem build rules require: + +- `uv` for repo-level Python commands +- a local CPython interpreter with `Python.h` +- `VELARIA_PYTHON_BIN` when Bazel cannot discover a usable interpreter automatically + +Repository entrypoints: + +- `./scripts/run_core_regression.sh` +- `./scripts/run_python_ecosystem_regression.sh` +- `./scripts/run_experimental_regression.sh` +- `./scripts/run_python_ci_checks.sh` + +## Regression Anchors + +The following targets anchor this contract: + +- core + - `//:sql_regression_test` + - `//:stream_runtime_test` + - `//:source_sink_abi_test` + - 
`//:stream_strategy_explain_test` + - `//:vector_runtime_test` +- python ecosystem + - `//python_api:streaming_v05_test` + - `//python_api:arrow_stream_ingestion_test` + - `//python_api:python_cli_contract_test` + - `//python_api:vector_search_test` + - `//python_api:read_excel_test` + - `//python_api:custom_stream_source_test` + - `//python_api:bitable_stream_source_test` + - `//python_api:bitable_group_by_owner_integration_test` diff --git a/docs/streaming_runtime_design.md b/docs/streaming_runtime_design.md index fd69611..9893ee8 100644 --- a/docs/streaming_runtime_design.md +++ b/docs/streaming_runtime_design.md @@ -1,5 +1,7 @@ # Streaming Runtime Design +This document describes runtime internals and current implementation shape. For the stable repository-facing contract, see [runtime-contract.md](./runtime-contract.md). For repository layering and ownership boundaries, see [core-boundary.md](./core-boundary.md). + ## Scope 当前文档只描述 Velaria 单机 `StreamingQuery` 路径,以及其与本机 actor-stream 运行时的接合点。 @@ -182,7 +184,7 @@ actor-stream payload 当前使用 typed binary batch: - query 级 `Auto` 当前阈值更接近“保守正确”,还不是最终调优状态。 - `split_ms / merge_ms` 仍是毫秒级指标,对极短阶段不够敏感。 - SQL 路径仍未自动下推到 actor runtime;当前优化主要落在 streaming 执行内核。 -- 同机 observability 仍是 experiment profile,不是完整 dashboard / distributed telemetry 体系。 +- 同机 observability 仍是 experiment profile,不是完整 distributed telemetry 体系。 ## Recommended Next Steps diff --git a/python_api/BUILD.bazel b/python_api/BUILD.bazel index 9992fb8..a21947e 100644 --- a/python_api/BUILD.bazel +++ b/python_api/BUILD.bazel @@ -33,15 +33,36 @@ py_library( py_binary( name = "demo_stream_sql", - srcs = ["demo_stream_sql.py"], - main = "demo_stream_sql.py", + srcs = ["examples/demo_stream_sql.py"], + main = "examples/demo_stream_sql.py", deps = [":velaria_py_pkg"], ) py_binary( name = "demo_batch_sql_arrow", - srcs = ["demo_batch_sql_arrow.py"], - main = "demo_batch_sql_arrow.py", + srcs = ["examples/demo_batch_sql_arrow.py"], + main = 
"examples/demo_batch_sql_arrow.py", + deps = [":velaria_py_pkg"], +) + +py_binary( + name = "demo_bitable_group_by_owner", + srcs = ["examples/demo_bitable_group_by_owner.py"], + main = "examples/demo_bitable_group_by_owner.py", + deps = [":velaria_py_pkg"], +) + +py_binary( + name = "demo_vector_search", + srcs = ["examples/demo_vector_search.py"], + main = "examples/demo_vector_search.py", + deps = [":velaria_py_pkg"], +) + +py_binary( + name = "bench_arrow_ingestion", + srcs = ["benchmarks/bench_arrow_ingestion.py"], + main = "benchmarks/bench_arrow_ingestion.py", deps = [":velaria_py_pkg"], ) @@ -49,6 +70,15 @@ py_binary( name = "velaria_cli", srcs = ["velaria_cli.py"], main = "velaria_cli.py", + deps = [ + ":velaria_cli_lib", + ], +) + +py_library( + name = "velaria_cli_lib", + srcs = ["velaria_cli.py"], + imports = ["."], deps = [":velaria_py_pkg"], ) @@ -64,6 +94,30 @@ filegroup( visibility = ["//visibility:public"], ) +filegroup( + name = "velaria_python_supported_sources", + srcs = [ + ":velaria_python_sources", + "velaria_cli.py", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_python_example_sources", + srcs = glob([ + "examples/**/*.py", + "benchmarks/**/*.py", + ]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "velaria_python_experimental_sources", + srcs = ["experimental/README.md"], + visibility = ["//visibility:public"], +) + py_wheel( name = "velaria_whl", distribution = "velaria", @@ -175,3 +229,28 @@ py_test( requirement("pyarrow"), ], ) + +py_test( + name = "python_cli_contract_test", + srcs = ["tests/test_python_cli_contract.py"], + main = "tests/test_python_cli_contract.py", + imports = ["."], + deps = [ + ":velaria_cli_lib", + ":velaria_py_pkg", + ], +) + +test_suite( + name = "velaria_python_supported_regression", + tests = [ + ":arrow_stream_ingestion_test", + ":bitable_group_by_owner_integration_test", + ":bitable_stream_source_test", + ":custom_stream_source_test", + 
":python_cli_contract_test", + ":read_excel_test", + ":streaming_v05_test", + ":vector_search_test", + ], +) diff --git a/python_api/README.md b/python_api/README.md index 30715f9..e406e3b 100644 --- a/python_api/README.md +++ b/python_api/README.md @@ -1,148 +1,217 @@ -# Velaria Python API +# Velaria Python Ecosystem -Python API package for the Velaria dataflow engine. +This document is the entrypoint for Velaria's supported Python ecosystem layer. -Notes: +Python is a supported ingress, interop, and packaging surface. It is not the execution core. Core semantics still come from the native kernel and the runtime contract in [docs/runtime-contract.md](../docs/runtime-contract.md). -- Dependency management and demo execution use `uv`. -- The pure-Python wheel is built by Bazel target `//python_api:velaria_whl`. -- The native extension is built separately by Bazel target `//:velaria_pyext`. -- Bazel runtime loading uses `//python_api:velaria_py_pkg`, which packages the Python sources together with a package-local `_velaria.so`. -- `python_api/pyproject.toml` also declares `velaria/_velaria.so` as package data so setuptools/uv packaging will include the native module whenever that file is present in the package tree. -- Runtime loading prefers the package-local extension, then auto-discovers `bazel-bin/_velaria.so` in a source checkout. -- Version bumps should use `./scripts/bump_velaria_version.sh `, which updates the Bazel version source, Python package version source, and refreshes `uv.lock`. 
+## Scope -Quick start: +### Supported -```bash -bazel build //:velaria_pyext -uv sync --project python_api --python python3.12 -uv run --project python_api python python_api/demo_batch_sql_arrow.py -uv run --project python_api python python_api/demo_stream_sql.py -``` +The supported Python ecosystem includes: -Single-file CLI packaging (Python deps + native `_velaria.so`): +- the `velaria/` package and `Session` API +- Arrow ingestion and Arrow output +- `uv`-based local workflow +- native extension build +- wheel / native wheel packaging +- the supported CLI entrypoint `velaria_cli.py` +- Excel ingestion via `read_excel(...)` +- Bitable adapters and stream source integration +- custom source / custom sink adapters +- vector search and vector explain APIs -```bash -./scripts/build_py_cli_executable.sh -./dist/velaria-cli csv-sql --csv /path/to/input.csv --query "SELECT * FROM input_table LIMIT 5" -./dist/velaria-cli vector-search --csv /path/to/vectors.csv --vector-column embedding --query-vector "0.1,0.2,0.3" --metric cosine --top-k 5 -``` +### Examples -Python Session API for local vector search: +Examples and helper assets include: -```python -from velaria import Session +- `examples/demo_batch_sql_arrow.py` +- `examples/demo_stream_sql.py` +- `examples/demo_bitable_group_by_owner.py` +- `examples/demo_vector_search.py` +- `benchmarks/bench_arrow_ingestion.py` +- local ecosystem scripts and skills -session = Session() -# assume a temp view named "vec_src" already exists -out = session.vector_search("vec_src", "embedding", [0.1, 0.2, 0.3], top_k=5, metric="dot") -print(out.to_rows()) -print(session.explain_vector_search("vec_src", "embedding", [0.1, 0.2, 0.3], top_k=5, metric="dot")) -``` +### Experimental -Current vector search scope is local exact scan only (`cosine`/`dot`/`l2`) on fixed-dimension float vectors. +The Python experimental area is currently reserved under `experimental/`. 
-Native binary CLI alternative (runtime does not require Python environment): +Anything placed there is explicitly outside the supported ecosystem surface until it is promoted into `velaria/`, `velaria_cli.py`, or a supported adapter module. -```bash -bazel build //:velaria_cli -./bazel-bin/velaria_cli --csv /path/to/input.csv --query "SELECT * FROM input_table LIMIT 5" -./bazel-bin/velaria_cli --csv /path/to/vectors.csv --vector-column embedding --query-vector "0.1,0.2,0.3" --metric l2 --top-k 5 -``` +### Not In Scope + +Python does not define: + +- execution hot-path semantics +- a separate progress schema +- a separate checkpoint contract +- a separate vector scoring implementation for supported APIs +- Python UDFs in the hot path + +## API Surface -## CI packaging +Main `Session` API: -PR CI builds and uploads two native wheel variants: +- `Session.read_csv(...)` +- `Session.sql(...)` +- `Session.create_dataframe_from_arrow(...)` +- `Session.create_stream_from_arrow(...)` +- `Session.create_temp_view(...)` +- `Session.read_stream_csv_dir(...)` +- `Session.stream_sql(...)` +- `Session.explain_stream_sql(...)` +- `Session.start_stream_sql(...)` +- `Session.vector_search(...)` +- `Session.explain_vector_search(...)` -- manylinux wheel from the Linux job -- macOS wheel from the macOS job +Additional ecosystem helpers: -The Linux path uses `auditwheel repair` after building `//python_api:velaria_native_whl`. The macOS path uploads the Bazel-built native wheel directly. 
+- `read_excel(...)` +- `CustomArrowStreamSource` +- `CustomArrowStreamSink` +- `create_stream_from_custom_source(...)` +- `consume_arrow_batches_with_custom_sink(...)` -Tag-based release publishing is separate: +Mapping rule: -- bump the package version with `./scripts/bump_velaria_version.sh ` -- create a matching Git tag such as `v0.1.1` -- the release workflow verifies the tag matches `velaria.__version__` and publishes Linux and macOS wheel assets +- Python names may be ecosystem-friendly +- behavior must map back to the same native kernel contract exposed by C++ +## Repository Layout -## v0.5 Python 用例与测试 +Stable Python layout in this repo: -- `python_api/demo_batch_sql_arrow.py`:Arrow batch + SQL 临时视图路径。 -- `python_api/demo_stream_sql.py`:stream SQL + sink 路径。 -- `python_api/bench_arrow_ingestion.py`:对比 table / `RecordBatchReader` / `__arrow_c_stream__` ingestion 路径。 -- `bazel test //python_api:streaming_v05_test`:自动化覆盖 Arrow 输入、stream SQL 启动、progress 合同字段。 -- `Session.explain_stream_sql(...)`:直接返回 `logical / physical / strategy` 三段 explain 文本。 -- `bazel test //python_api:arrow_stream_ingestion_test`:自动化覆盖 `RecordBatchReader`、`__arrow_c_stream__` 和 stream batch 边界。 +- supported library: + - `python_api/velaria/` +- supported CLI tool: + - `python_api/velaria_cli.py` +- examples: + - `python_api/examples/` +- benchmarks: + - `python_api/benchmarks/` +- reserved experimental area: + - `python_api/experimental/` +- regression tests: + - `python_api/tests/` -建议本地顺序: +## Toolchain and Environment + +Repository Python commands use `uv`. + +Recommended local baseline: + +- CPython `3.12` +- `uv` +- local CPython headers (`Python.h`) + +Bazel Python detection currently probes local CPython interpreters in the `3.9` to `3.13` range. 
If auto-discovery fails, set: ```bash -bazel build //:velaria_pyext -bazel test //python_api:streaming_v05_test -bazel test //python_api:arrow_stream_ingestion_test -uv run --project python_api python python_api/demo_batch_sql_arrow.py -uv run --project python_api python python_api/demo_stream_sql.py -uv run --project python_api python python_api/bench_arrow_ingestion.py +export VELARIA_PYTHON_BIN=/path/to/python3.12 ``` +That interpreter must expose `Python.h`; otherwise Bazel cannot build the native extension. -## Custom Stream Source(Python) +## Development Workflow -现在 Python API 提供可复用的 custom stream source 适配: +Bootstrap: -- `CustomArrowStreamSource`:把 Python 行数据转换成 Arrow micro-batches。 -- `Session.create_dataframe_from_arrow(...)` / `Session.create_stream_from_arrow(...)` 现在优先接受 `RecordBatchReader` 和实现 `__arrow_c_stream__` 的对象,再回退到 `Table / RecordBatch / batch sequence`。 -- 默认 emit 策略:`1 秒` 或 `1024 行` 触发一次 batch(可配置)。 -- `create_stream_from_custom_source(session, rows, ...)`:直接转换并调用 `session.create_stream_from_arrow(...)`。 -- `CustomArrowStreamSink`:消费 Arrow micro-batches,并按“1秒或N条”聚合后触发 `on_emit` 回调。 +```bash +bazel build //:velaria_pyext +uv sync --project python_api --python python3.12 +``` -示例: +Run demos: -```python -from velaria import ( - CustomArrowStreamSink, - CustomArrowStreamSource, - consume_arrow_batches_with_custom_sink, -) +```bash +uv run --project python_api python python_api/examples/demo_batch_sql_arrow.py +uv run --project python_api python python_api/examples/demo_stream_sql.py +uv run --project python_api python python_api/examples/demo_vector_search.py +``` -source = CustomArrowStreamSource(emit_interval_seconds=1.0, emit_rows=500) -stream_df = source.to_stream_dataframe(session, rows_iterable) +Recommended regression entrypoint: -sink = CustomArrowStreamSink(lambda table: print(table.num_rows), emit_interval_seconds=1.0, emit_rows=500) -consume_arrow_batches_with_custom_sink([stream_df_batch_1, stream_df_batch_2], sink) +```bash 
+./scripts/run_python_ecosystem_regression.sh ``` -测试: +That script covers: + +- native extension build +- wheel and native wheel build +- Bazel Python regression targets +- demo smoke +- CLI smoke + +## Packaging + +Build targets: + +- native extension: + - `//:velaria_pyext` +- pure-Python wheel wrapper: + - `//python_api:velaria_whl` +- native wheel: + - `//python_api:velaria_native_whl` +- Python CLI: + - `//python_api:velaria_cli` + +Single-file CLI packaging: ```bash -bazel test //python_api:custom_stream_source_test -# 该测试同时覆盖 custom source 与 custom sink 的 emit 逻辑 +./scripts/build_py_cli_executable.sh +./dist/velaria-cli csv-sql \ + --csv /path/to/input.csv \ + --query "SELECT * FROM input_table LIMIT 5" ``` +The CLI is part of the ecosystem layer. For supported paths, it should delegate to the same native session contract as Python and C++. + +Python ecosystem source groups: + +- supported: + - `//python_api:velaria_python_supported_sources` +- examples and benchmarks: + - `//python_api:velaria_python_example_sources` +- experimental placeholder: + - `//python_api:velaria_python_experimental_sources` + +## Arrow Contract + +Supported Arrow ingestion inputs: -`Session.start_stream_sql(...)` 额外支持 `checkpoint_delivery_mode` 参数: +- `pyarrow.Table` +- `pyarrow.RecordBatch` +- `pyarrow.RecordBatchReader` +- objects implementing `__arrow_c_stream__` +- Python sequences of Arrow batches -- `at-least-once`(默认) -- `best-effort` +Vector-preferred Arrow shape: -`Session.explain_stream_sql(...)` 复用同一组选项参数: +- `FixedSizeList` -- `sql` -- `trigger_interval_ms` -- `checkpoint_path` -- `checkpoint_delivery_mode` +Preferred local CSV vector text shape: -### XLSX 数据读取 +- `[1 2 3]` +- `[1,2,3]` -仓库里已提供 `velaria.read_excel(...)` 直接读 `.xlsx`: +Current vector search scope: -- 先调用 `pandas.read_excel` -- 再转成 `pyarrow.Table` -- 再通过 `Session.create_dataframe_from_arrow(...)` 变成 Velaria DataFrame +- local exact scan only +- metrics: `cosine`, `dot`, `l2` +- no ANN / distributed 
execution / standalone vector DB behavior -示例: +## Excel, Bitable, and Custom Streams + +### Excel + +`read_excel(...)` reads `.xlsx` through: + +1. `pandas.read_excel` +2. `pyarrow.Table` conversion +3. `Session.create_dataframe_from_arrow(...)` + +Example: ```python from velaria import Session, read_excel @@ -150,12 +219,52 @@ from velaria import Session, read_excel session = Session() df = read_excel(session, "/path/to/file.xlsx", sheet_name="Sheet1") session.create_temp_view("staff", df) -out = session.sql("SELECT * FROM staff LIMIT 5") -print(out.to_rows()) +print(session.sql("SELECT * FROM staff LIMIT 5").to_rows()) ``` -该能力依赖运行时安装 `pandas` 与 `openpyxl`(已作为 Python 包依赖): +### Bitable and Custom Streams -```bash -uv run python -c "import pandas, openpyxl" -``` +Supported ecosystem integrations include: + +- Bitable-backed stream source flows +- custom Arrow stream sources +- custom Arrow stream sinks + +These are supported as ecosystem integrations, not as alternate execution cores. + +## Regression Matrix + +Python ecosystem regression targets: + +- `//python_api:streaming_v05_test` +- `//python_api:arrow_stream_ingestion_test` +- `//python_api:vector_search_test` +- `//python_api:read_excel_test` +- `//python_api:custom_stream_source_test` +- `//python_api:bitable_stream_source_test` +- `//python_api:bitable_group_by_owner_integration_test` + +Python-layer grouped suite: + +- `//python_api:velaria_python_supported_regression` + +Root-level grouped suite: + +- `//:python_ecosystem_regression` + +## Relation to Core + +Python may: + +- wrap +- package +- automate +- project ecosystem-friendly names + +Python may not: + +- redefine progress/checkpoint/explain semantics +- become the source of truth for runtime decisions +- introduce a second vector-search implementation for supported interfaces + +For core boundaries, see [docs/core-boundary.md](../docs/core-boundary.md). For stable runtime semantics, see [docs/runtime-contract.md](../docs/runtime-contract.md). 
diff --git a/python_api/bench_arrow_ingestion.py b/python_api/benchmarks/bench_arrow_ingestion.py similarity index 100% rename from python_api/bench_arrow_ingestion.py rename to python_api/benchmarks/bench_arrow_ingestion.py diff --git a/python_api/examples/README.md b/python_api/examples/README.md new file mode 100644 index 0000000..1fa4045 --- /dev/null +++ b/python_api/examples/README.md @@ -0,0 +1,14 @@ +# Python Ecosystem Examples + +This directory contains Python ecosystem examples and smoke assets. + +These files demonstrate supported surfaces, but they are not themselves part of the stable public API. They should call into the supported `velaria` package and the supported CLI contract instead of re-implementing kernel behavior. + +Current examples: + +- `demo_batch_sql_arrow.py` +- `demo_stream_sql.py` +- `demo_bitable_group_by_owner.py` +- `demo_vector_search.py` + +Benchmarks live in [../benchmarks](../benchmarks). diff --git a/python_api/demo_batch_sql_arrow.py b/python_api/examples/demo_batch_sql_arrow.py similarity index 100% rename from python_api/demo_batch_sql_arrow.py rename to python_api/examples/demo_batch_sql_arrow.py diff --git a/python_api/demo_bitable_group_by_owner.py b/python_api/examples/demo_bitable_group_by_owner.py similarity index 100% rename from python_api/demo_bitable_group_by_owner.py rename to python_api/examples/demo_bitable_group_by_owner.py diff --git a/python_api/demo_stream_sql.py b/python_api/examples/demo_stream_sql.py similarity index 100% rename from python_api/demo_stream_sql.py rename to python_api/examples/demo_stream_sql.py diff --git a/python_api/examples/demo_vector_search.py b/python_api/examples/demo_vector_search.py new file mode 100644 index 0000000..00b631f --- /dev/null +++ b/python_api/examples/demo_vector_search.py @@ -0,0 +1,59 @@ +import json + +import pyarrow as pa + +from velaria import Session + + +def main(): + session = Session() + vectors = pa.FixedSizeListArray.from_arrays( + pa.array( + [ + 1.0, + 
0.0, + 0.0, + 0.9, + 0.1, + 0.0, + 0.0, + 1.0, + 0.0, + ], + type=pa.float32(), + ), + 3, + ) + table = pa.table({"id": [1, 2, 3], "embedding": vectors}) + df = session.create_dataframe_from_arrow(table) + session.create_temp_view("vec_demo", df) + + result = session.vector_search( + table="vec_demo", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ).to_arrow() + explain = session.explain_vector_search( + table="vec_demo", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ) + + print( + json.dumps( + { + "schema": result.schema.names, + "rows": result.to_pylist(), + "explain": explain, + }, + indent=2, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/python_api/experimental/README.md b/python_api/experimental/README.md new file mode 100644 index 0000000..c9f7eed --- /dev/null +++ b/python_api/experimental/README.md @@ -0,0 +1,11 @@ +# Python Ecosystem Experimental Area + +This directory is reserved for Python-side experiments that are not part of the supported ecosystem surface. + +Rules: + +- do not place stable `Session` or runtime contract projections here +- do not introduce alternate execution semantics here +- anything added here should stay out of the default Python ecosystem regression until promoted + +The current repository does not ship supported Python modules from this directory. 
diff --git a/python_api/tests/test_python_cli_contract.py b/python_api/tests/test_python_cli_contract.py new file mode 100644 index 0000000..e55da7c --- /dev/null +++ b/python_api/tests/test_python_cli_contract.py @@ -0,0 +1,84 @@ +import io +import json +import pathlib +import tempfile +import unittest +from contextlib import redirect_stdout +from unittest import mock + +import velaria_cli + + +class _FakeArrowResult: + @property + def schema(self): + return mock.Mock(names=["row_id", "score"]) + + def to_pylist(self): + return [{"row_id": 0, "score": 0.0}] + + +class _FakeDataFrame: + def to_arrow(self): + return _FakeArrowResult() + + +class PythonCliContractTest(unittest.TestCase): + def test_vector_cli_delegates_to_session_contract(self): + fake_session = mock.Mock() + fake_session.read_csv.return_value = mock.Mock(name="df") + fake_session.vector_search.return_value = _FakeDataFrame() + fake_session.explain_vector_search.return_value = ( + "mode=exact-scan\n" + "metric=cosine\n" + "dimension=3\n" + "top_k=2\n" + "candidate_rows=3\n" + "filter_pushdown=false\n" + "acceleration=flat-buffer+heap-topk\n" + ) + + with tempfile.TemporaryDirectory(prefix="velaria-cli-contract-") as tmp: + csv_path = pathlib.Path(tmp) / "vectors.csv" + csv_path.write_text("id,embedding\n1,[1 0 0]\n", encoding="utf-8") + with mock.patch.object(velaria_cli, "Session", return_value=fake_session): + stdout = io.StringIO() + with redirect_stdout(stdout): + exit_code = velaria_cli._run_vector_search( + csv_path=csv_path, + vector_column="embedding", + query_vector="1.0,0.0,0.0", + metric="cosine", + top_k=2, + ) + + self.assertEqual(exit_code, 0) + fake_session.read_csv.assert_called_once_with(str(csv_path)) + fake_session.create_temp_view.assert_called_once() + fake_session.vector_search.assert_called_once_with( + table="input_table", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ) + fake_session.explain_vector_search.assert_called_once_with( 
+ table="input_table", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ) + + payload = json.loads(stdout.getvalue()) + self.assertEqual(payload["metric"], "cosine") + self.assertEqual(payload["top_k"], 2) + self.assertEqual(payload["schema"], ["row_id", "score"]) + self.assertEqual(payload["rows"], [{"row_id": 0, "score": 0.0}]) + self.assertIn("mode=exact-scan", payload["explain"]) + self.assertIn("candidate_rows=3", payload["explain"]) + self.assertIn("filter_pushdown=false", payload["explain"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python_api/tests/test_streaming_v05.py b/python_api/tests/test_streaming_v05.py index 243a0cb..3cc07c7 100644 --- a/python_api/tests/test_streaming_v05.py +++ b/python_api/tests/test_streaming_v05.py @@ -54,6 +54,11 @@ def test_stream_progress_contract_with_start_stream_sql(self): self.assertEqual(progress["checkpoint_delivery_mode"], "best-effort") self.assertGreaterEqual(progress["batches_processed"], 1) self.assertIn(progress["source_is_bounded"], [True, False]) + self.assertIn("estimated_state_size_bytes", progress) + self.assertIn("estimated_batch_cost", progress) + self.assertIn("backpressure_max_queue_batches", progress) + self.assertIn("backpressure_high_watermark", progress) + self.assertIn("backpressure_low_watermark", progress) snapshot_json = json.dumps(progress) self.assertIn("execution_mode", snapshot_json) diff --git a/python_api/tests/test_vector_search.py b/python_api/tests/test_vector_search.py index 0d67740..111a131 100644 --- a/python_api/tests/test_vector_search.py +++ b/python_api/tests/test_vector_search.py @@ -43,7 +43,10 @@ def test_vector_search_metrics_and_explain(self): ) self.assertIn("mode=exact-scan", explain) self.assertIn("metric=cosine", explain) + self.assertIn("dimension=3", explain) self.assertIn("top_k=2", explain) + self.assertIn("candidate_rows=3", explain) + self.assertIn("filter_pushdown=false", explain) 
self.assertIn("acceleration=flat-buffer+heap-topk", explain) def test_vector_dimension_mismatch(self): @@ -62,6 +65,56 @@ def test_vector_dimension_mismatch(self): metric="l2", ) + def test_vector_search_fixed_size_list_record_batch_reader(self): + session = Session() + vectors = pa.FixedSizeListArray.from_arrays( + pa.array( + [ + 1.0, + 0.0, + 0.0, + 0.8, + 0.2, + 0.0, + 0.0, + 1.0, + 0.0, + ], + type=pa.float32(), + ), + 3, + ) + batch = pa.record_batch( + [pa.array([10, 20, 30], type=pa.int64()), vectors], + names=["id", "embedding"], + ) + reader = pa.RecordBatchReader.from_batches(batch.schema, [batch]) + df = session.create_dataframe_from_arrow(reader) + session.create_temp_view("vec_reader_py", df) + + result = session.vector_search( + table="vec_reader_py", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ).to_rows() + self.assertEqual(result["rows"][0][0], 0) + self.assertEqual(len(result["rows"]), 2) + + explain = session.explain_vector_search( + table="vec_reader_py", + vector_column="embedding", + query_vector=[1.0, 0.0, 0.0], + top_k=2, + metric="cosine", + ) + self.assertIn("mode=exact-scan", explain) + self.assertIn("dimension=3", explain) + self.assertIn("top_k=2", explain) + self.assertIn("candidate_rows=3", explain) + self.assertIn("filter_pushdown=false", explain) + if __name__ == "__main__": unittest.main() diff --git a/python_api/velaria/custom_stream.py b/python_api/velaria/custom_stream.py index b72eeb0..0747940 100644 --- a/python_api/velaria/custom_stream.py +++ b/python_api/velaria/custom_stream.py @@ -107,9 +107,11 @@ def __init__( self.options.validate() self._pending: List[pa.Table] = [] self._pending_rows = 0 - self._window_start = time.monotonic() + self._window_start: Optional[float] = None def write_batch(self, batch: pa.Table) -> None: + if self._window_start is None: + self._window_start = time.monotonic() self._pending.append(batch) self._pending_rows += batch.num_rows now = 
time.monotonic() @@ -128,7 +130,7 @@ def _emit_pending(self, now: float) -> None: self.on_emit(merged) self._pending = [] self._pending_rows = 0 - self._window_start = now + self._window_start = None if not self._pending else now def create_stream_from_custom_source( diff --git a/python_api/velaria_cli.py b/python_api/velaria_cli.py index f57fcd5..121ee15 100644 --- a/python_api/velaria_cli.py +++ b/python_api/velaria_cli.py @@ -1,8 +1,6 @@ import argparse import json -import math import pathlib -from typing import Iterable from velaria import Session @@ -36,37 +34,6 @@ def _parse_vector_text(text: str) -> list[float]: return [float(part.strip()) for part in value.split(",") if part.strip()] -def _extract_row_vector(raw_value) -> list[float]: - if isinstance(raw_value, (list, tuple)): - return [float(v) for v in raw_value] - return _parse_vector_text(str(raw_value)) - - -def _cosine_distance(lhs: Iterable[float], rhs: Iterable[float]) -> float: - lhs_values = list(lhs) - rhs_values = list(rhs) - dot = sum(a * b for a, b in zip(lhs_values, rhs_values)) - lhs_norm = math.sqrt(sum(a * a for a in lhs_values)) - rhs_norm = math.sqrt(sum(b * b for b in rhs_values)) - if lhs_norm == 0.0 or rhs_norm == 0.0: - return 1.0 - similarity = dot / (lhs_norm * rhs_norm) - similarity = max(-1.0, min(1.0, similarity)) - return 1.0 - similarity - - -def _l2_distance(lhs: Iterable[float], rhs: Iterable[float]) -> float: - lhs_values = list(lhs) - rhs_values = list(rhs) - return math.sqrt(sum((a - b) * (a - b) for a, b in zip(lhs_values, rhs_values))) - - -def _dot_score(lhs: Iterable[float], rhs: Iterable[float]) -> float: - lhs_values = list(lhs) - rhs_values = list(rhs) - return sum(a * b for a, b in zip(lhs_values, rhs_values)) - - def _run_vector_search( csv_path: pathlib.Path, vector_column: str, @@ -75,35 +42,32 @@ def _run_vector_search( top_k: int, ) -> int: session = Session() - table = session.read_csv(str(csv_path)).to_arrow() - rows = table.to_pylist() + df = 
session.read_csv(str(csv_path)) + session.create_temp_view("input_table", df) needle = _parse_vector_text(query_vector) if not needle: raise ValueError("--query-vector must not be empty") - scored = [] - expected_dim = len(needle) - for row_index, row in enumerate(rows): - if vector_column not in row: - raise KeyError(f"vector column not found: {vector_column}") - vector = _extract_row_vector(row[vector_column]) - if len(vector) != expected_dim: - raise ValueError( - f"fixed length vector mismatch at row {row_index}: expect {expected_dim}, got {len(vector)}" - ) - if metric in ("cosine", "cosin"): - distance = _cosine_distance(vector, needle) - elif metric == "dot": - distance = _dot_score(vector, needle) - else: - distance = _l2_distance(vector, needle) - scored.append({"row_index": row_index, "distance": distance, "row": row}) - - scored.sort(key=lambda item: item["distance"], reverse=(metric == "dot")) + result = session.vector_search( + table="input_table", + vector_column=vector_column, + query_vector=needle, + top_k=top_k, + metric=metric, + ).to_arrow() + explain = session.explain_vector_search( + table="input_table", + vector_column=vector_column, + query_vector=needle, + top_k=top_k, + metric=metric, + ) payload = { "metric": "cosine" if metric in ("cosine", "cosin") else metric, "top_k": top_k, - "rows": scored[:top_k], + "schema": result.schema.names, + "rows": result.to_pylist(), + "explain": explain, } print(json.dumps(payload, indent=2, ensure_ascii=False)) return 0 @@ -144,7 +108,7 @@ def _build_parser() -> argparse.ArgumentParser: vector_search.add_argument( "--vector-column", required=True, - help="Vector column name. Row value format supports '1,2,3' or '[1,2,3]'.", + help="Vector column name. 
CSV row value format should use bracketed vectors like '[1 2 3]' or '[1,2,3]'.", ) vector_search.add_argument( "--query-vector", diff --git a/scripts/BUILD.bazel b/scripts/BUILD.bazel index 0afaec1..64764a6 100644 --- a/scripts/BUILD.bazel +++ b/scripts/BUILD.bazel @@ -1,8 +1,12 @@ exports_files([ "build_py_cli_executable.sh", - "build_dashboard_frontend.sh", "build_native_wheel.py", + "run_core_regression.sh", "run_actor_rpc_e2e.sh", + "run_experimental_regression.sh", + "run_python_ci_checks.sh", + "run_python_ecosystem_regression.sh", "run_actor_rpc_scheduler.sh", "run_stream_observability_regression.sh", + "run_vector_search_benchmark.sh", ]) diff --git a/scripts/build_dashboard_frontend.sh b/scripts/build_dashboard_frontend.sh deleted file mode 100755 index 69540de..0000000 --- a/scripts/build_dashboard_frontend.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -TS_SOURCE="${1:-${PROJECT_ROOT}/src/dataflow/runner/dashboard/app.ts}" -TS_OUTPUT="${2:-${PROJECT_ROOT}/src/dataflow/runner/dashboard/app.js}" -NPM_CACHE_DIR="${NPM_CONFIG_CACHE:-${TMPDIR:-/tmp}/cpp-dataflow-dashboard-npm-cache}" - -mkdir -p "${NPM_CACHE_DIR}" -export NPM_CONFIG_CACHE="${NPM_CACHE_DIR}" -export npm_config_cache="${NPM_CACHE_DIR}" - -if [[ ! 
-f "${TS_SOURCE}" ]]; then - echo "[dashboard-ts] source not found: ${TS_SOURCE}" >&2 - exit 1 -fi - -if [[ -f "${TS_OUTPUT}" && "${TS_OUTPUT}" -nt "${TS_SOURCE}" ]]; then - echo "[dashboard-ts] up to date: ${TS_OUTPUT}" - exit 0 -fi - -if command -v tsc >/dev/null 2>&1; then - TS_COMPILER="tsc" -elif node -e "require('typescript/package.json');" >/dev/null 2>&1; then - TS_COMPILER="node -e 'const ts = require(\"typescript\"); const fs=require(\"fs\"); const p=process.argv[1]; const out=process.argv[2]; const txt=fs.readFileSync(p,'\''utf8'\''); const r=ts.transpileModule(txt,{compilerOptions:{target:ts.ScriptTarget.ES2018,module:ts.ModuleKind.None}}); fs.writeFileSync(out,r.outputText);'" -else - TS_COMPILER="npx --yes --package=typescript@5.4.5 tsc" -fi - -if [[ "$(dirname "${TS_OUTPUT}")" != "." && ! -d "$(dirname "${TS_OUTPUT}")" ]]; then - mkdir -p "$(dirname "${TS_OUTPUT}")" -fi - -echo "[dashboard-ts] compiling ${TS_SOURCE}" -if [[ "${TS_COMPILER}" == tsc ]]; then - tsc --pretty false --target ES2018 --module none --outFile "${TS_OUTPUT}" "${TS_SOURCE}" -elif [[ "${TS_COMPILER}" == node* ]]; then - eval ${TS_COMPILER} "${TS_SOURCE}" "${TS_OUTPUT}" -else - ${TS_COMPILER} --pretty false --target ES2018 --module none --outFile "${TS_OUTPUT}" "${TS_SOURCE}" -fi - -echo "[dashboard-ts] generated ${TS_OUTPUT}" diff --git a/scripts/run_actor_rpc_e2e.sh b/scripts/run_actor_rpc_e2e.sh index 569c848..6620aca 100755 --- a/scripts/run_actor_rpc_e2e.sh +++ b/scripts/run_actor_rpc_e2e.sh @@ -13,7 +13,6 @@ Env: PAYLOAD default "demo payload" TIMEOUT_SECONDS default 20 DO_BUILD default 0 (set to 1 to run build first) - BUILD_DASHBOARD default 1 (set to 0 to skip dashboard frontend build) Examples: scripts/run_actor_rpc_e2e.sh @@ -27,7 +26,6 @@ WORKER_ID="${WORKER_ID:-worker-1}" PAYLOAD="${PAYLOAD:-demo payload}" TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-20}" DO_BUILD="${DO_BUILD:-0}" -BUILD_DASHBOARD="${BUILD_DASHBOARD:-1}" MODE_SQL="" MODE_PAYLOAD="" @@ -66,10 +64,6 @@ 
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" cd "${PROJECT_ROOT}" -if [[ "${BUILD_DASHBOARD}" != "0" ]]; then - bazel build //:dashboard_app_js -fi - if [[ "${DO_BUILD}" == "1" ]]; then bazel build //:actor_rpc_scheduler //:actor_rpc_worker //:actor_rpc_client //:actor_rpc_smoke fi @@ -89,23 +83,9 @@ wait_tcp_open() { local start start="$(date +%s)" while true; do - if python3 - "${host}" "${port}" <<'PY' >/dev/null 2>&1 -import socket -import sys - -host = sys.argv[1] -port = int(sys.argv[2]) - -try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.settimeout(0.3) - s.connect((host, port)) - s.close() - print("ok") -except Exception: - raise SystemExit(1) -PY - then + if (exec 3<>"/dev/tcp/${host}/${port}") >/dev/null 2>&1; then + exec 3<&- + exec 3>&- return 0 fi if (( $(date +%s) - start >= timeout )); then diff --git a/scripts/run_actor_rpc_scheduler.sh b/scripts/run_actor_rpc_scheduler.sh index b021b5c..b2ed1af 100755 --- a/scripts/run_actor_rpc_scheduler.sh +++ b/scripts/run_actor_rpc_scheduler.sh @@ -9,10 +9,9 @@ Usage: Env: DO_BUILD default 0 (set to 1 to run bazel build first) - BUILD_DASHBOARD default 1 (set to 0 to skip bazel dashboard build) Examples: - scripts/run_actor_rpc_scheduler.sh -- --listen 127.0.0.1:61000 --node-id scheduler --dashboard-enabled --dashboard-listen 127.0.0.1:8080 + scripts/run_actor_rpc_scheduler.sh -- --listen 127.0.0.1:61000 --node-id scheduler EOF } @@ -26,11 +25,6 @@ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" cd "${PROJECT_ROOT}" DO_BUILD="${DO_BUILD:-0}" -BUILD_DASHBOARD="${BUILD_DASHBOARD:-1}" - -if [[ "${BUILD_DASHBOARD}" != "0" ]]; then - bazel build //:dashboard_app_js -fi if [[ "${DO_BUILD}" == "1" ]]; then bazel build //:actor_rpc_scheduler diff --git a/scripts/run_core_regression.sh b/scripts/run_core_regression.sh new file mode 100755 index 0000000..9d0dcb8 --- /dev/null +++ b/scripts/run_core_regression.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT" + +bazel test //:core_regression +bazel run //:sql_demo +bazel run //:df_demo +bazel run //:stream_demo + +echo "[summary] core regression ok" diff --git a/scripts/run_experimental_regression.sh b/scripts/run_experimental_regression.sh new file mode 100755 index 0000000..14431df --- /dev/null +++ b/scripts/run_experimental_regression.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT" + +bazel test //:experimental_regression +bazel run //:actor_rpc_smoke +./scripts/run_actor_rpc_e2e.sh --payload "experimental regression payload" + +echo "[summary] experimental regression ok" diff --git a/scripts/run_python_ci_checks.sh b/scripts/run_python_ci_checks.sh index 9288c64..fbcbd34 100755 --- a/scripts/run_python_ci_checks.sh +++ b/scripts/run_python_ci_checks.sh @@ -19,7 +19,28 @@ fi bazel build //:velaria_pyext uv sync --project python_api --python "${VELARIA_PYTHON_BIN}" +bazel test //:python_ecosystem_regression + +tmp_csv="$(mktemp "${TMPDIR:-/tmp}/velaria-python-ci-XXXXXX.csv")" +tmp_vec_csv="$(mktemp "${TMPDIR:-/tmp}/velaria-python-ci-vector-XXXXXX.csv")" +trap 'rm -f "$tmp_csv" "$tmp_vec_csv"' EXIT +printf 'id,name\n1,alice\n2,bob\n' >"$tmp_csv" +printf 'id,embedding\n1,[1 0 0]\n2,[0.9 0.1 0]\n3,[0 1 0]\n' >"$tmp_vec_csv" + +PYTHONPATH="${PYTHONPATH:-$(pwd)/python_api}" \ + uv run --project python_api python 
python_api/examples/demo_batch_sql_arrow.py +PYTHONPATH="${PYTHONPATH:-$(pwd)/python_api}" \ + uv run --project python_api python python_api/examples/demo_stream_sql.py PYTHONPATH="${PYTHONPATH:-$(pwd)/python_api}" \ - uv run --project python_api python python_api/demo_batch_sql_arrow.py + uv run --project python_api python python_api/velaria_cli.py \ + csv-sql \ + --csv "$tmp_csv" \ + --query "SELECT * FROM input_table LIMIT 1" PYTHONPATH="${PYTHONPATH:-$(pwd)/python_api}" \ - uv run --project python_api python python_api/demo_stream_sql.py + uv run --project python_api python python_api/velaria_cli.py \ + vector-search \ + --csv "$tmp_vec_csv" \ + --vector-column embedding \ + --query-vector "1.0,0.0,0.0" \ + --metric cosine \ + --top-k 2 diff --git a/scripts/run_python_ecosystem_regression.sh b/scripts/run_python_ecosystem_regression.sh new file mode 100755 index 0000000..d0e2e05 --- /dev/null +++ b/scripts/run_python_ecosystem_regression.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT" + +if [[ -z "${VELARIA_PYTHON_BIN:-}" ]]; then + echo "VELARIA_PYTHON_BIN is required for python ecosystem regression" >&2 + exit 1 +fi + +if ! command -v "${VELARIA_PYTHON_BIN}" >/dev/null 2>&1; then + echo "VELARIA_PYTHON_BIN does not resolve to an executable: ${VELARIA_PYTHON_BIN}" >&2 + exit 1 +fi + +if ! 
command -v uv >/dev/null 2>&1; then + echo "uv is required for python ecosystem regression" >&2 + exit 1 +fi + +bazel build //:velaria_pyext //python_api:velaria_whl //python_api:velaria_native_whl //python_api:velaria_cli +bazel test //:python_ecosystem_regression + +uv sync --project python_api --python "${VELARIA_PYTHON_BIN}" +PYTHONPATH="${PYTHONPATH:-${ROOT}/python_api}" \ + uv run --project python_api python python_api/examples/demo_batch_sql_arrow.py +PYTHONPATH="${PYTHONPATH:-${ROOT}/python_api}" \ + uv run --project python_api python python_api/examples/demo_stream_sql.py + +tmp_csv="$(mktemp "${TMPDIR:-/tmp}/velaria-cli-XXXXXX.csv")" +tmp_vec_csv="$(mktemp "${TMPDIR:-/tmp}/velaria-cli-vector-XXXXXX.csv")" +trap 'rm -f "$tmp_csv" "$tmp_vec_csv"' EXIT +printf 'id,name\n1,alice\n2,bob\n' >"$tmp_csv" +printf 'id,embedding\n1,[1 0 0]\n2,[0.9 0.1 0]\n3,[0 1 0]\n' >"$tmp_vec_csv" + +PYTHONPATH="${PYTHONPATH:-${ROOT}/python_api}" \ + uv run --project python_api python python_api/velaria_cli.py \ + csv-sql \ + --csv "$tmp_csv" \ + --query "SELECT * FROM input_table LIMIT 1" + +PYTHONPATH="${PYTHONPATH:-${ROOT}/python_api}" \ + uv run --project python_api python python_api/velaria_cli.py \ + vector-search \ + --csv "$tmp_vec_csv" \ + --vector-column embedding \ + --query-vector "1.0,0.0,0.0" \ + --metric cosine \ + --top-k 2 + +echo "[summary] python ecosystem regression ok" diff --git a/scripts/run_vector_search_benchmark.sh b/scripts/run_vector_search_benchmark.sh new file mode 100755 index 0000000..5d0718a --- /dev/null +++ b/scripts/run_vector_search_benchmark.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +cd "$ROOT" + +vector_output="$(mktemp)" +trap 'rm -f "$vector_output"' EXIT + +bazel run //:vector_search_benchmark -- --quick >"$vector_output" + +uv run python - "$vector_output" <<'PY' +import json +import pathlib +import sys + +path = pathlib.Path(sys.argv[1]) +lines = path.read_text().splitlines() + +query_profiles = [] +transport_profiles = [] +for line in lines: + if not line.startswith("{"): + continue + payload = json.loads(line) + bench = payload.get("bench") + if bench == "vector-query": + query_profiles.append(payload) + elif bench == "vector-transport": + transport_profiles.append(payload) + +if not query_profiles: + raise SystemExit("missing vector-query benchmark output") +if not transport_profiles: + raise SystemExit("missing vector-transport benchmark output") + +for item in query_profiles: + for key in [ + "bench", + "rows", + "dimension", + "top_k", + "metric", + "cold_query_us", + "warm_query_avg_us", + "warm_explain_avg_us", + "result_rows", + ]: + if key not in item: + raise SystemExit(f"vector-query benchmark JSON missing key: {key}") + if item["bench"] != "vector-query": + raise SystemExit("unexpected bench kind in vector-query payload") + if item["top_k"] != 10: + raise SystemExit("vector-query benchmark top_k drifted from v0.1 baseline") + if item["metric"] not in {"cosine", "dot", "l2"}: + raise SystemExit(f"unexpected vector metric in benchmark output: {item['metric']}") + if item["result_rows"] <= 0: + raise SystemExit("vector-query benchmark returned no rows") + +for item in transport_profiles: + for key in [ + "bench", + "rows", + "dimension", + "proto_serialize_us", + "proto_deserialize_us", + "proto_payload_bytes", + "binary_serialize_us", + "binary_deserialize_us", + "binary_payload_bytes", + "actor_rpc_encode_us", + "actor_rpc_decode_us", + "actor_rpc_control_bytes", + ]: + if key not in item: + raise SystemExit(f"vector-transport benchmark JSON missing key: {key}") + if item["bench"] != "vector-transport": + raise 
SystemExit("unexpected bench kind in vector-transport payload") + +print("[summary] vector search benchmark baseline ok") +PY diff --git a/src/dataflow/core/csv.cc b/src/dataflow/core/csv.cc index 8f2d35d..3caa11b 100644 --- a/src/dataflow/core/csv.cc +++ b/src/dataflow/core/csv.cc @@ -29,6 +29,12 @@ bool isDouble(const std::string& s) { Value parseCell(const std::string& cell) { if (cell.empty()) return Value(); + if (cell.size() >= 2 && cell.front() == '[' && cell.back() == ']') { + const auto vec = Value::parseFixedVector(cell); + if (!vec.empty()) { + return Value(vec); + } + } if (isInt(cell)) { return Value(static_cast(std::stoll(cell))); } diff --git a/src/dataflow/core/value.h b/src/dataflow/core/value.h index e3bf3b2..cb4ff8b 100644 --- a/src/dataflow/core/value.h +++ b/src/dataflow/core/value.h @@ -67,16 +67,14 @@ class Value { if (!text.empty() && text.front() == '[' && text.back() == ']') { text = text.substr(1, text.size() - 2); } + for (char& ch : text) { + if (ch == ',') ch = ' '; + } std::vector out; std::stringstream ss(text); - std::string token; - while (std::getline(ss, token, ',')) { - if (token.empty()) continue; - std::stringstream trim(token); - std::string cleaned; - trim >> cleaned; - if (cleaned.empty()) continue; - out.push_back(std::stof(cleaned)); + float value = 0.0f; + while (ss >> value) { + out.push_back(value); } return out; } diff --git a/src/dataflow/examples/actor_rpc_scheduler.cc b/src/dataflow/examples/actor_rpc_scheduler.cc index 525a08d..11f14a9 100644 --- a/src/dataflow/examples/actor_rpc_scheduler.cc +++ b/src/dataflow/examples/actor_rpc_scheduler.cc @@ -18,15 +18,6 @@ int main(int argc, char* argv[]) { config.node_id = argv[++i]; continue; } - if (std::string(argv[i]) == "--dashboard-listen" && i + 1 < argc) { - config.dashboard_listen_address = argv[++i]; - config.dashboard_enabled = true; - continue; - } - if (std::string(argv[i]) == "--dashboard-enabled") { - config.dashboard_enabled = true; - continue; - } if 
(std::string(argv[i]) == "--auto-worker") { config.auto_worker = true; continue; diff --git a/src/dataflow/examples/vector_search_benchmark.cc b/src/dataflow/examples/vector_search_benchmark.cc index 47a508f..376b0fc 100644 --- a/src/dataflow/examples/vector_search_benchmark.cc +++ b/src/dataflow/examples/vector_search_benchmark.cc @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -164,10 +165,31 @@ void runTransportCase(std::size_t rows, std::size_t dim) { } // namespace -int main() { - std::cout << "[vector-benchmark] exact scan regression baseline" << std::endl; - for (std::size_t rows : {10000ULL, 100000ULL}) { - for (std::size_t dim : {128ULL, 768ULL}) { +int main(int argc, char** argv) { + bool quick = false; + for (int i = 1; i < argc; ++i) { + const std::string arg = argv[i]; + if (arg == "--quick") { + quick = true; + continue; + } + if (arg == "-h" || arg == "--help") { + std::cout << "Usage: " << argv[0] << " [--quick]\n"; + std::cout << " --quick run a smaller exact-scan baseline for repo verification\n"; + return 0; + } + throw std::runtime_error("unknown argument: " + arg); + } + + const std::vector rows_cases = quick ? std::vector{10000ULL} + : std::vector{10000ULL, 100000ULL}; + const std::vector dim_cases = quick ? std::vector{128ULL} + : std::vector{128ULL, 768ULL}; + + std::cout << "[vector-benchmark] exact scan regression baseline" + << (quick ? 
" (quick)" : " (full)") << std::endl; + for (std::size_t rows : rows_cases) { + for (std::size_t dim : dim_cases) { runCase(rows, dim, dataflow::VectorDistanceMetric::Cosine, "cosine"); runCase(rows, dim, dataflow::VectorDistanceMetric::Dot, "dot"); runCase(rows, dim, dataflow::VectorDistanceMetric::L2, "l2"); diff --git a/src/dataflow/examples/velaria_cli.cc b/src/dataflow/examples/velaria_cli.cc index 0cd64ee..916527a 100644 --- a/src/dataflow/examples/velaria_cli.cc +++ b/src/dataflow/examples/velaria_cli.cc @@ -1,10 +1,7 @@ #include -#include -#include #include #include #include -#include #include "src/dataflow/api/session.h" #include "src/dataflow/core/value.h" @@ -49,6 +46,8 @@ std::string valueToJson(const dataflow::Value& value) { return value.toString(); case dataflow::DataType::String: return "\"" + escapeJson(value.asString()) + "\""; + case dataflow::DataType::FixedVector: + return "\"" + escapeJson(value.toString()) + "\""; } return "null"; } @@ -58,66 +57,32 @@ void printUsage(const char* program) { << " --csv [--query ] [--table ] [--delimiter ]\n" << " " << program << " --csv --vector-column --query-vector \n" - << " [--metric cosine|cosin|dot|l2] [--top-k ]\n"; + << " [--metric cosine|cosin|dot|l2] [--top-k ]\n" + << " vector CSV cells should use bracketed values like '[1 2 3]' or '[1,2,3]'\n"; } -std::vector parseVectorText(const std::string& raw) { +std::vector parseVectorText(const std::string& raw) { std::string input = raw; if (!input.empty() && input.front() == '[' && input.back() == ']') { input = input.substr(1, input.size() - 2); } - std::vector out; - std::stringstream ss(input); + std::vector out; std::string item; - while (std::getline(ss, item, ',')) { - if (item.empty()) continue; - out.push_back(std::stod(item)); + std::size_t start = 0; + while (start <= input.size()) { + const std::size_t end = input.find(',', start); + item = input.substr(start, end == std::string::npos ? 
std::string::npos : end - start); + if (!item.empty()) { + out.push_back(static_cast(std::stof(item))); + } + if (end == std::string::npos) { + break; + } + start = end + 1; } return out; } -std::vector parseVectorValue(const dataflow::Value& value) { - if (value.type() == dataflow::DataType::String) { - return parseVectorText(value.asString()); - } - return parseVectorText(value.toString()); -} - -double l2Distance(const std::vector& lhs, const std::vector& rhs) { - double sum = 0.0; - for (std::size_t i = 0; i < lhs.size(); ++i) { - const double diff = lhs[i] - rhs[i]; - sum += diff * diff; - } - return std::sqrt(sum); -} - -double cosineDistance(const std::vector& lhs, const std::vector& rhs) { - double dot = 0.0; - double lhs_norm = 0.0; - double rhs_norm = 0.0; - for (std::size_t i = 0; i < lhs.size(); ++i) { - dot += lhs[i] * rhs[i]; - lhs_norm += lhs[i] * lhs[i]; - rhs_norm += rhs[i] * rhs[i]; - } - if (lhs_norm == 0.0 || rhs_norm == 0.0) { - return 1.0; - } - double similarity = dot / (std::sqrt(lhs_norm) * std::sqrt(rhs_norm)); - if (similarity > 1.0) similarity = 1.0; - if (similarity < -1.0) similarity = -1.0; - return 1.0 - similarity; -} - -double dotScore(const std::vector& lhs, const std::vector& rhs) { - double dot = 0.0; - for (std::size_t i = 0; i < lhs.size(); ++i) { - dot += lhs[i] * rhs[i]; - } - return dot; -} - } // namespace int main(int argc, char** argv) { @@ -177,69 +142,42 @@ int main(int argc, char** argv) { try { auto& session = dataflow::DataflowSession::builder(); auto df = session.read_csv(csv_path, delimiter); + session.createTempView(table, df); if (vector_mode) { - auto result = df.toTable(); - if (!result.schema.has(vector_column)) { - throw std::runtime_error("vector column not found: " + vector_column); - } - const auto vector_index = result.schema.indexOf(vector_column); const auto needle = parseVectorText(query_vector); if (needle.empty()) { throw std::runtime_error("query vector cannot be empty"); } - - struct Candidate { - 
std::size_t row_index; - double distance; - }; - std::vector candidates; - candidates.reserve(result.rows.size()); - - for (std::size_t i = 0; i < result.rows.size(); ++i) { - auto vec = parseVectorValue(result.rows[i][vector_index]); - if (vec.size() != needle.size()) { - throw std::runtime_error("fixed length vector mismatch at row " + std::to_string(i)); - } - double distance = 0.0; - if (metric == "cosine" || metric == "cosin") { - distance = cosineDistance(vec, needle); - } else if (metric == "dot") { - distance = dotScore(vec, needle); - } else if (metric == "l2") { - distance = l2Distance(vec, needle); - } else { - throw std::runtime_error("unsupported metric: " + metric); - } - candidates.push_back(Candidate{i, distance}); - } - + dataflow::VectorDistanceMetric runtime_metric = dataflow::VectorDistanceMetric::Cosine; if (metric == "dot") { - std::sort(candidates.begin(), candidates.end(), [](const Candidate& lhs, const Candidate& rhs) { - return lhs.distance > rhs.distance; - }); - } else { - std::sort(candidates.begin(), candidates.end(), [](const Candidate& lhs, const Candidate& rhs) { - return lhs.distance < rhs.distance; - }); + runtime_metric = dataflow::VectorDistanceMetric::Dot; + } else if (metric == "l2") { + runtime_metric = dataflow::VectorDistanceMetric::L2; + } else if (metric != "cosine" && metric != "cosin") { + throw std::runtime_error("unsupported metric: " + metric); } - - const std::size_t emit = std::min(top_k, candidates.size()); + const auto result = + session.vectorQuery(table, vector_column, needle, top_k, runtime_metric).toTable(); + const auto explain = + session.explainVectorQuery(table, vector_column, needle, top_k, runtime_metric); std::cout << "{\n"; std::cout << " \"metric\": \"" << (metric == "cosin" ? 
"cosine" : metric) << "\",\n"; std::cout << " \"top_k\": " << top_k << ",\n"; + std::cout << " \"explain\": \"" << escapeJson(explain) << "\",\n"; std::cout << " \"rows\": [\n"; - for (std::size_t i = 0; i < emit; ++i) { - const auto& c = candidates[i]; - std::cout << " {\"row_index\": " << c.row_index << ", \"distance\": " << c.distance - << ", \"row\": ["; - const auto& row = result.rows[c.row_index]; + for (std::size_t i = 0; i < result.rows.size(); ++i) { + const auto& row = result.rows[i]; + std::cout << " {"; for (std::size_t j = 0; j < row.size(); ++j) { if (j > 0) std::cout << ", "; - std::cout << valueToJson(row[j]); + std::cout << "\"" << escapeJson(result.schema.fields[j]) << "\": " + << valueToJson(row[j]); + } + std::cout << "}"; + if (i + 1 < result.rows.size()) { + std::cout << ","; } - std::cout << "]}"; - if (i + 1 < emit) std::cout << ","; std::cout << "\n"; } std::cout << " ]\n"; @@ -247,7 +185,6 @@ int main(int argc, char** argv) { return 0; } - session.createTempView(table, df); auto result = session.sql(query).toTable(); std::cout << "{\n"; diff --git a/src/dataflow/python/python_module.cc b/src/dataflow/python/python_module.cc index 3a47cda..c74b0cd 100644 --- a/src/dataflow/python/python_module.cc +++ b/src/dataflow/python/python_module.cc @@ -919,6 +919,16 @@ PyObject* pyProgressFromNative(const df::StreamingQueryProgress& progress) { setDictItem(out, "actor_speedup", PyFloat_FromDouble(progress.actor_speedup)); setDictItem(out, "compute_to_overhead_ratio", PyFloat_FromDouble(progress.compute_to_overhead_ratio)); + setDictItem(out, "estimated_state_size_bytes", + PyLong_FromUnsignedLongLong(progress.estimated_state_size_bytes)); + setDictItem(out, "estimated_batch_cost", + PyLong_FromUnsignedLongLong(progress.estimated_batch_cost)); + setDictItem(out, "backpressure_max_queue_batches", + PyLong_FromUnsignedLongLong(progress.backpressure_max_queue_batches)); + setDictItem(out, "backpressure_high_watermark", + 
PyLong_FromUnsignedLongLong(progress.backpressure_high_watermark)); + setDictItem(out, "backpressure_low_watermark", + PyLong_FromUnsignedLongLong(progress.backpressure_low_watermark)); return out; } diff --git a/src/dataflow/runner/actor_runtime.cc b/src/dataflow/runner/actor_runtime.cc index 852b0bf..d3e6b7c 100644 --- a/src/dataflow/runner/actor_runtime.cc +++ b/src/dataflow/runner/actor_runtime.cc @@ -6,16 +6,13 @@ #include #include #include -#include #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -106,11 +103,6 @@ std::string summarizeTable(const Table& table) { return out.str(); } -[[maybe_unused]] std::string nextDashboardJobId() { - static std::atomic seq{1}; - return "dashboard_job_" + std::to_string(seq.fetch_add(1)); -} - Table makeDemoInputTable(const std::string& payload) { Table table(Schema({"token", "bucket", "score"}), {}); const int64_t base = static_cast(payload.size()); @@ -142,282 +134,17 @@ DataFrame buildClientPlan(const std::string& payload, const std::string& sql) { return buildPayloadPlan(payload); } -std::string toLowerCopy(std::string value) { - for (char& ch : value) { - ch = static_cast(std::tolower(static_cast(ch))); - } - return value; -} - -std::string toUpperCopy(std::string value) { - for (char& ch : value) { - ch = static_cast(std::toupper(static_cast(ch))); - } - return value; -} - -std::string urlDecode(const std::string& input) { - std::string out; - out.reserve(input.size()); - auto hexToInt = [](char h) -> int { - if (h >= '0' && h <= '9') return h - '0'; - if (h >= 'a' && h <= 'f') return 10 + (h - 'a'); - if (h >= 'A' && h <= 'F') return 10 + (h - 'A'); - return -1; - }; - for (std::size_t i = 0; i < input.size(); ++i) { - const char ch = input[i]; - if (ch == '+') { - out.push_back(' '); - continue; - } - if (ch == '%' && i + 2 < input.size()) { - const int hi = hexToInt(input[i + 1]); - const int lo = hexToInt(input[i + 2]); - if (hi >= 0 && lo >= 0) { - 
out.push_back(static_cast((hi << 4) | lo)); - i += 2; - continue; - } - } - out.push_back(ch); - } - return out; -} - -std::unordered_map parseFormBody(const std::string& body) { - std::unordered_map out; - std::size_t start = 0; - while (start < body.size()) { - const std::size_t sep = body.find('&', start); - const std::string pair = body.substr(start, sep == std::string::npos ? std::string::npos : sep - start); - if (!pair.empty()) { - const std::size_t eq = pair.find('='); - if (eq == std::string::npos) { - out[urlDecode(pair)] = ""; - } else { - out[urlDecode(pair.substr(0, eq))] = urlDecode(pair.substr(eq + 1)); - } - } - if (sep == std::string::npos) break; - start = sep + 1; - } - return out; -} - -void sendHttpResponse(int fd, int code, const std::string& reason, const std::string& body, - const std::string& content_type) { - std::ostringstream out; - out << "HTTP/1.1 " << code << " " << reason << "\r\n" - << "Content-Type: " << content_type << "; charset=utf-8\r\n" - << "Content-Length: " << body.size() << "\r\n" - << "Access-Control-Allow-Origin: *\r\n" - << "Connection: close\r\n" - << "\r\n" - << body; - const std::string text = out.str(); - sendAllBytes(fd, reinterpret_cast(text.data()), text.size()); -} - bool isLocalWorkerNodeId(const std::string& node_id, const std::string& scheduler_node_id) { const std::string prefix = scheduler_node_id + "-worker-"; return node_id.compare(0, prefix.size(), prefix) == 0; } -bool readHttpRequest(int fd, std::string* method, std::string* path, std::string* body) { - if (!method || !path || !body) return false; - method->clear(); - path->clear(); - body->clear(); - - std::string request; - std::size_t content_length = 0; - std::size_t header_end = std::string::npos; - while (true) { - char buffer[4096]; - const ssize_t n = ::recv(fd, buffer, sizeof(buffer), 0); - if (n <= 0) return false; - request.append(buffer, n); - if (header_end == std::string::npos) { - header_end = request.find("\r\n\r\n"); - if (header_end 
== std::string::npos) { - if (request.size() > 64 * 1024) return false; - continue; - } - std::istringstream header_stream(request.substr(0, header_end)); - if (!(header_stream >> *method >> *path)) return false; - std::string version; - header_stream >> version; - if (version.empty()) return false; - std::string line; - while (std::getline(header_stream, line)) { - if (!line.empty() && line.back() == '\r') line.pop_back(); - const std::size_t split = line.find(':'); - if (split == std::string::npos) continue; - const std::string key = toLowerCopy(line.substr(0, split)); - const std::string value = line.substr(split + 1); - if (key == "content-length") { - try { - content_length = static_cast(std::stoul(value)); - } catch (...) { - content_length = 0; - } - } - } - *method = toUpperCopy(*method); - } - const std::size_t header_pos = header_end + 4; - if (request.size() < header_pos) return false; - const std::size_t body_available = request.size() - header_pos; - if (body_available < content_length) { - if (request.size() > 128 * 1024) return false; - continue; - } - body->assign(request.substr(header_pos, content_length)); - return true; - } -} - -std::string dashboardRoot() { - static const std::string root = [] { - auto fileExists = [](const std::string& path) { - return std::ifstream(path).good(); - }; - auto appendDash = [](const std::string& dir) { - return dir + "/src/dataflow/runner/dashboard"; - }; - - auto testCandidate = [&](const std::string& candidate) { - if (fileExists(candidate + "/index.html")) { - return candidate; - } - return std::string(); - }; - auto tryCandidate = [&](const std::string& candidate) -> std::string { - if (!candidate.empty()) { - const auto hit = testCandidate(candidate); - if (!hit.empty()) return hit; - } - return std::string(); - }; - - auto testRunfile = [&](const std::string& runfileRoot) -> std::string { - const std::string workspaceRoot = runfileRoot + "/src/dataflow/runner/dashboard"; - const std::string mainAliasRoot = 
runfileRoot + "/__main__/src/dataflow/runner/dashboard"; - const std::string workspace = getenv("TEST_WORKSPACE") ? getenv("TEST_WORKSPACE") : "cpp-dataflow-distributed-engine"; - const std::string wsAliasRoot = runfileRoot + "/" + workspace + "/src/dataflow/runner/dashboard"; - const auto hit = testCandidate(workspaceRoot); - if (!hit.empty()) return hit; - const auto hit2 = testCandidate(mainAliasRoot); - if (!hit2.empty()) return hit2; - return testCandidate(wsAliasRoot); - }; - - std::string source_file(__FILE__); - const std::size_t sep = source_file.find_last_of("/\\"); - if (sep != std::string::npos) { - const std::string candidate = source_file.substr(0, sep) + "/dashboard"; - const std::string hit = testCandidate(candidate); - if (!hit.empty()) return hit; - } - - if (const char* workspace = std::getenv("BUILD_WORKSPACE_DIRECTORY")) { - const std::string hit = tryCandidate(appendDash(std::string(workspace))); - if (!hit.empty()) return hit; - const std::string hit_bin = tryCandidate(std::string(workspace) + "/bazel-bin/src/dataflow/runner/dashboard"); - if (!hit_bin.empty()) return hit_bin; - const std::string hit_bin2 = tryCandidate(std::string(workspace) + "/bazel-out/bin/src/dataflow/runner/dashboard"); - if (!hit_bin2.empty()) return hit_bin2; - } - - if (const char* test_srcdir = std::getenv("TEST_SRCDIR")) { - const std::string hit = tryCandidate(appendDash(std::string(test_srcdir))); - if (!hit.empty()) return hit; - const std::string hit_workspace = tryCandidate(std::string(test_srcdir) + "/cpp-dataflow-distributed-engine/src/dataflow/runner/dashboard"); - if (!hit_workspace.empty()) return hit_workspace; - } - - if (const char* runfiles = std::getenv("RUNFILES_DIR")) { - const std::string runfiles_root = runfiles; - const std::string hit = testRunfile(runfiles_root); - if (!hit.empty()) return hit; - } - - if (const char* test_tmpdir = std::getenv("TEST_TMPDIR")) { - const std::string base = test_tmpdir; - const auto slash = 
base.find_last_of("/\\"); - if (slash != std::string::npos) { - const std::string root = base.substr(0, slash); - const std::string hit = tryCandidate(appendDash(root)); - if (!hit.empty()) return hit; - } - } - - const std::string cwd = std::filesystem::current_path().string(); - std::filesystem::path path = std::filesystem::path(cwd); - for (int i = 0; i < 10; ++i) { - const std::string candidate = appendDash(path.string()); - const std::string hit = testCandidate(candidate); - if (!hit.empty()) return hit; - if (!path.has_parent_path()) break; - path = path.parent_path(); - } - - return appendDash("."); - }(); - return root; -} - -std::string dashboardMimeType(const std::string& path) { - if (path.size() >= 5 && path.compare(path.size() - 5, 5, ".html") == 0) return "text/html"; - if (path.size() >= 3 && path.compare(path.size() - 3, 3, ".js") == 0) return "text/javascript"; - if (path.size() >= 3 && path.compare(path.size() - 3, 3, ".ts") == 0) return "text/plain"; - if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".css") == 0) return "text/css"; - return "text/plain"; -} - -std::string dashboardReadFile(const std::string& relative_path, bool* ok) { - std::string full_path = dashboardRoot() + relative_path; - std::ifstream file(full_path, std::ios::binary); - if (!file.is_open()) { - if (ok) *ok = false; - return {}; - } - if (ok) *ok = true; - std::ostringstream out; - out << file.rdbuf(); - return out.str(); -} - struct LocalWorkerProcess { pid_t pid; std::string node_id; bool registered; }; -void serveDashboardAsset(int fd, const std::string& raw_path) { - std::string path = raw_path; - const std::size_t qmark = path.find('?'); - if (qmark != std::string::npos) path = path.substr(0, qmark); - if (path.empty() || path == "/") path = "/index.html"; - if (path.front() != '/') path = "/" + path; - - if (path.find("..") != std::string::npos) { - sendHttpResponse(fd, 403, "Forbidden", R"json({"ok":false,"message":"invalid file path"})json", - 
"application/json"); - return; - } - - bool ok = false; - const std::string content = dashboardReadFile(path, &ok); - if (!ok) { - sendHttpResponse(fd, 404, "Not Found", R"json({"ok":false,"message":"not found"})json", - "application/json"); - return; - } - sendHttpResponse(fd, 200, "OK", content, dashboardMimeType(path)); -} - struct RpcTaskSnapshot { std::string task_id; std::string state = "SUBMITTED"; @@ -494,27 +221,6 @@ std::string snapshotToJson(const RpcJobSnapshot& snapshot) { }); } -std::string buildJobsJson(const std::unordered_map& job_snapshots) { - std::vector jobs; - std::vector job_ids; - job_ids.reserve(job_snapshots.size()); - for (const auto& item : job_snapshots) { - job_ids.push_back(item.first); - } - std::sort(job_ids.begin(), job_ids.end()); - for (const auto& job_id : job_ids) { - const auto it = job_snapshots.find(job_id); - if (it != job_snapshots.end()) { - jobs.push_back(snapshotToJson(it->second)); - } - } - using namespace observability; - return object({ - field("count", static_cast(jobs.size())), - field("jobs", array(jobs), true), - }); -} - RpcFrame makeFrameFromMessage(uint64_t msg_id, uint64_t correlation_id, const std::string& source, @@ -623,22 +329,6 @@ int runActorScheduler(const ActorRuntimeConfig& config) { std::cout << "[scheduler] listen " << config.listen_address << "\n"; - std::string dashboard_host; - uint16_t dashboard_port = 0; - int dashboard_server_fd = -1; - if (config.dashboard_enabled) { - if (!parseConfigEndpoint(config.dashboard_listen_address, &dashboard_host, &dashboard_port)) { - std::cerr << "Invalid --dashboard-listen endpoint: " << config.dashboard_listen_address << "\n"; - return 1; - } - dashboard_server_fd = createServerSocket(dashboard_host, dashboard_port); - if (dashboard_server_fd < 0) { - std::cerr << "scheduler failed to listen dashboard on " << config.dashboard_listen_address << "\n"; - return 1; - } - std::cout << "[dashboard] listen " << config.dashboard_listen_address << "\n"; - } - 
LengthPrefixedFrameCodec codec; std::vector conns; std::unordered_map conn_role; @@ -650,7 +340,6 @@ int runActorScheduler(const ActorRuntimeConfig& config) { std::unordered_map task_to_worker; std::unordered_map pending_worker_result_msgs; std::unordered_map job_snapshots; - std::vector dashboard_conns; uint64_t next_message_id = 1; std::unordered_set local_worker_nodes; int launching_local_workers = 0; @@ -778,8 +467,6 @@ int runActorScheduler(const ActorRuntimeConfig& config) { worker_config.node_id = config.node_id + "-worker-" + std::to_string(worker_index); worker_config.connect_address = config.listen_address; worker_config.single_node = false; - worker_config.dashboard_enabled = false; - const pid_t pid = fork(); if (pid < 0) { emitRpcEvent("actor_scheduler", "local_worker_spawn_failed", config.node_id, @@ -820,140 +507,6 @@ int runActorScheduler(const ActorRuntimeConfig& config) { {observability::field("snapshot", snapshotToJson(snapshot), true)}); }; - auto submitDashboardJob = [&](const std::string& payload_input, - const std::string& sql, - std::string* job_id_out, - std::string* summary_out) -> bool { - if (workers.empty()) { - ensureLocalWorkers(); - if (!config.auto_worker && workers.empty()) { - if (job_id_out) { - job_id_out->clear(); - } - if (summary_out) { - summary_out->assign("scheduler reject: no worker available"); - } - emitRpcEvent("actor_scheduler", "dashboard_submit", config.node_id, - "JOB_SUBMIT_REJECTED", "", - {observability::field("reason", "no worker available")}); - return false; - } - if (workers.empty() && launching_local_workers > 0 && summary_out) { - summary_out->assign("scheduler started local worker, job submitted to remote execution queue"); - } - } - - DataFrame plan_df; - try { - plan_df = buildClientPlan(payload_input, sql); - } catch (const std::exception& e) { - if (job_id_out) job_id_out->clear(); - if (summary_out) summary_out->assign(e.what()); - return false; - } - - const auto handle = 
JobMaster::instance().submitRemote(plan_df.serializePlan()); - const std::string job_id = handle.id(); - - RpcJobSnapshot snapshot; - snapshot.job_id = job_id; - snapshot.client_node = "dashboard"; - snapshot.sql = sql; - snapshot.payload = flattenText(sql.empty() ? plan_df.explain() : sql); - snapshot.state = "SUBMITTED"; - snapshot.status_code = "JOB_SUBMITTED"; - job_snapshots[job_id] = snapshot; - - if (job_id_out) *job_id_out = job_id; - if (summary_out) *summary_out = snapshot.payload; - emitSnapshotEvent(snapshot); - return true; - }; - - auto handleDashboardConnection = [&](int fd) { - std::string method; - std::string path; - std::string body; - if (!readHttpRequest(fd, &method, &path, &body)) { - sendHttpResponse(fd, 400, "Bad Request", R"json({"ok":false,"message":"invalid request"})json", - "application/json"); - return; - } - - const std::size_t qmark = path.find('?'); - if (qmark != std::string::npos) path = path.substr(0, qmark); - - if (method == "GET" && (path == "/" || path == "/index.html" || path == "/app.js" || path == "/app.ts")) { - serveDashboardAsset(fd, path); - return; - } - - if (method == "GET" && path.rfind("/api/", 0) != 0 && path.find('.') == std::string::npos) { - serveDashboardAsset(fd, "/index.html"); - return; - } - - if (method == "GET" && path == "/api/jobs") { - sendHttpResponse(fd, 200, "OK", buildJobsJson(job_snapshots), "application/json"); - return; - } - - if (method == "GET" && path.compare(0, 10, "/api/jobs/") == 0 && path.size() > 10) { - const std::string job_id = path.substr(10); - const auto it = job_snapshots.find(job_id); - if (it == job_snapshots.end()) { - sendHttpResponse(fd, 404, "Not Found", R"json({"ok":false,"message":"job not found"})json", - "application/json"); - } else { - sendHttpResponse(fd, 200, "OK", snapshotToJson(it->second), "application/json"); - } - return; - } - - if (method == "POST" && path == "/api/jobs") { - const auto fields = parseFormBody(body); - const std::string payload_input = 
fields.count("payload") ? fields.at("payload") : ""; - const std::string sql = fields.count("sql") ? fields.at("sql") : ""; - if (payload_input.empty() && sql.empty()) { - sendHttpResponse(fd, 400, "Bad Request", R"json({"ok":false,"message":"payload or sql required"})json", - "application/json"); - return; - } - - std::string job_id; - std::string summary; - if (!submitDashboardJob(payload_input, sql, &job_id, &summary)) { - const bool unavailable_worker = summary.find("no worker available") != std::string::npos; - const std::string detail = observability::object({ - observability::field("ok", false), - observability::field("message", summary.empty() ? "submit failed" : summary), - }); - sendHttpResponse(fd, unavailable_worker ? 503 : 500, - unavailable_worker ? "Service Unavailable" : "Internal Server Error", - detail, "application/json"); - return; - } - - const std::string detail = observability::object({ - observability::field("ok", true), - observability::field("job_id", job_id), - observability::field("state", job_snapshots[job_id].state), - observability::field("status_code", job_snapshots[job_id].status_code), - observability::field("payload", summary), - observability::field("result_payload", job_snapshots[job_id].result_payload), - }); - sendHttpResponse(fd, 200, "OK", detail, "application/json"); - emitRpcEvent("actor_scheduler", "dashboard_submit", config.node_id, "JOB_SUBMITTED", job_id, - {observability::field("job_id", job_id), - observability::field("client_node", "dashboard"), - observability::field("payload", summary)}); - return; - } - - sendHttpResponse(fd, 404, "Not Found", R"json({"ok":false,"message":"not found"})json", - "application/json"); - }; - auto sendTo = [&](int fd, const ActorRpcMessage& message) { const RpcFrame frame = makeFrameFromMessage(next_message_id++, config.node_id, "peer", message); return sendFrameOverSocket(fd, codec, frame); @@ -1100,20 +653,11 @@ int runActorScheduler(const ActorRuntimeConfig& config) { 
FD_ZERO(&reads); FD_SET(server_fd, &reads); int max_fd = server_fd; - if (dashboard_server_fd >= 0) { - FD_SET(dashboard_server_fd, &reads); - if (dashboard_server_fd > max_fd) max_fd = dashboard_server_fd; - } for (int fd : conns) { FD_SET(fd, &reads); if (fd > max_fd) max_fd = fd; } - for (int fd : dashboard_conns) { - FD_SET(fd, &reads); - if (fd > max_fd) max_fd = fd; - } - struct timeval tick; tick.tv_sec = 0; tick.tv_usec = 200 * 1000; @@ -1134,13 +678,6 @@ int runActorScheduler(const ActorRuntimeConfig& config) { } } - if (dashboard_server_fd >= 0 && FD_ISSET(dashboard_server_fd, &reads)) { - const int conn_fd = accept(dashboard_server_fd, nullptr, nullptr); - if (conn_fd >= 0) { - dashboard_conns.push_back(conn_fd); - } - } - for (std::size_t i = 0; i < conns.size();) { const int fd = conns[i]; if (!FD_ISSET(fd, &reads)) { @@ -1419,27 +956,11 @@ int runActorScheduler(const ActorRuntimeConfig& config) { ++i; } - for (std::size_t i = 0; i < dashboard_conns.size();) { - const int fd = dashboard_conns[i]; - if (!FD_ISSET(fd, &reads)) { - ++i; - continue; - } - handleDashboardConnection(fd); - dashboard_conns.erase(std::remove(dashboard_conns.begin(), dashboard_conns.end(), fd), dashboard_conns.end()); - ::close(fd); - } } for (const int fd : conns) { close(fd); } - for (const int fd : dashboard_conns) { - ::close(fd); - } - if (dashboard_server_fd >= 0) { - ::close(dashboard_server_fd); - } ::close(server_fd); stopAllLocalWorkers(); ensureSignalRestored(); diff --git a/src/dataflow/runner/actor_runtime.h b/src/dataflow/runner/actor_runtime.h index b7c0ec3..4d36487 100644 --- a/src/dataflow/runner/actor_runtime.h +++ b/src/dataflow/runner/actor_runtime.h @@ -17,10 +17,8 @@ struct ActorRuntimeConfig { std::string node_id = "node"; std::string connect_address = "127.0.0.1:61000"; std::string listen_address = "127.0.0.1:61000"; - std::string dashboard_listen_address = "127.0.0.1:8080"; bool auto_worker = true; int local_worker_count = 2; - bool dashboard_enabled = 
false; bool single_node = false; bool print_help = false; }; diff --git a/src/dataflow/runner/dashboard/app.ts b/src/dataflow/runner/dashboard/app.ts deleted file mode 100644 index c2d8a3a..0000000 --- a/src/dataflow/runner/dashboard/app.ts +++ /dev/null @@ -1,445 +0,0 @@ -// @ts-nocheck -declare const React: any; -declare const ReactDOM: any; - -interface JobSnapshot { - job_id: string; - state: string; - status_code: string; - client_node: string; - worker_node: string; - payload: string; - sql: string; - result_payload: string; - chain: { - chain_id: string; - state: string; - status_code: string; - task_ids: string[]; - }; - task: { - task_id: string; - state: string; - status_code: string; - worker_id: string; - }; -} - -interface JobListPayload { - count: number; - jobs: JobSnapshot[]; -} - -interface SubmitResult { - ok: boolean; - job_id: string; - state?: string; - status_code?: string; - payload?: string; - message?: string; -} - -const { useEffect, useCallback, useState, createElement: h } = React; - -const sqlPresets = [ - { - name: '默认聚合', - sql: 'SELECT token, SUM(score) AS total_score FROM rpc_input GROUP BY token', - }, - { - name: '按 SQL 过滤', - sql: 'SELECT token, SUM(score) AS total_score FROM rpc_input WHERE score > 0 GROUP BY token', - }, - { - name: '创建表', - sql: 'CREATE TABLE users (id,name,score)', - }, - { - name: '插入一条', - sql: 'INSERT INTO users VALUES (1, "alice", 10)', - }, - { - name: '按列插入', - sql: 'INSERT INTO users (id, name, score) VALUES (2, "bob", 20)', - }, -]; - -interface ComplexSqlStep { - label: string; - payload: string; - sql: string; -} - -const complexSqlDemos: Array<{ name: string; steps: ComplexSqlStep[] }> = [ - { - name: '复杂分析 Demo(本地 DDL/DML + 最终查询)', - steps: [ - { - label: 'Step 1: 创建用户明细表', - payload: '', - sql: "CREATE TABLE app_users (user_id INT, token STRING, score INT, region STRING)", - }, - { - label: 'Step 2: 写入用户明细', - payload: '', - sql: "INSERT INTO app_users VALUES (1, 'alice', 25, 'apac'), (2, 'bob', 
18, 'emea'), (3, 'claire', 34, 'na'), (4, 'david', 11, 'apac'), (5, 'ella', 7, 'na')", - }, - { - label: 'Step 3: 创建交易事实表', - payload: '', - sql: "CREATE TABLE app_actions (user_id INT, action STRING, score INT)", - }, - { - label: 'Step 4: 写入交易事实', - payload: '', - sql: "INSERT INTO app_actions VALUES (1, 'view', 5), (1, 'purchase', 20), (2, 'view', 12), (2, 'click', 6), (3, 'purchase', 30), (4, 'view', 4), (5, 'click', 11)", - }, - { - label: 'Step 5: 基于 join 的分组计算', - payload: '', - sql: "CREATE TABLE app_region_summary (region STRING, total_score INT, user_count INT)", - }, - { - label: 'Step 6: 写入分组结果', - payload: '', - sql: "INSERT INTO app_region_summary SELECT u.region AS region, SUM(a.score) AS total_score, COUNT(*) AS user_count FROM app_users AS u INNER JOIN app_actions AS a ON u.user_id = a.user_id WHERE a.score > 6 GROUP BY u.region HAVING SUM(a.score) > 15", - }, - { - label: 'Step 7: 查看汇总结果', - payload: '', - sql: "SELECT region, total_score, user_count FROM app_region_summary WHERE total_score > 10 LIMIT 5", - }, - ], - }, -]; - -const api = { - async listJobs(): Promise { - const response = await fetch('/api/jobs'); - if (!response.ok) { - throw new Error('列表查询失败'); - } - return (await response.json()) as JobListPayload; - }, - async getJob(jobId: string): Promise { - const response = await fetch(`/api/jobs/${jobId}`); - if (!response.ok) { - throw new Error('详情查询失败'); - } - return (await response.json()) as JobSnapshot; - }, - async submit(payload: string, sql: string): Promise { - const params = new URLSearchParams(); - params.set('payload', payload); - params.set('sql', sql); - const response = await fetch('/api/jobs', { - method: 'POST', - headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, - body: params.toString(), - }); - const body = (await response.json()) as SubmitResult & { message?: string }; - if (!response.ok || !body.ok) { - throw new Error(body.message || `提交失败: ${response.status}`); - } - return body; - }, -}; - 
-function stateClassName(state: string): string { - return state === 'FINISHED' || state === 'SUCCEEDED' || state === 'JOB_FINISHED' - ? 'ok' - : 'warn'; -} - -function shorten(value: string | undefined, maxLength: number): string { - if (!value) return '-'; - if (value.length <= maxLength) return value; - return `${value.slice(0, maxLength)}...`; -} - -function JobList({ - jobs, - onSelect, -}: { - jobs: JobSnapshot[]; - onSelect: (jobId: string) => void; -}) { - return h( - 'div', - { className: 'grid' }, - h( - 'table', - null, - h( - 'thead', - null, - h( - 'tr', - null, - h('th', null, 'JobId'), - h('th', null, 'State'), - h('th', null, 'SQL'), - h('th', null, 'Worker'), - h('th', null, '链路/任务'), - h('th', null, '结果摘要') - ) - ), - h( - 'tbody', - null, - ...jobs.map((job) => - h( - 'tr', - { key: job.job_id }, - h( - 'td', - null, - h( - 'a', - { - href: '#', - onClick: (event: Event) => { - event.preventDefault(); - onSelect(job.job_id); - }, - }, - job.job_id - ) - ), - h( - 'td', - null, - h('div', { className: stateClassName(job.state) }, job.state), - h('div', { className: 'tiny' }, job.status_code) - ), - h('td', null, shorten(job.sql || job.payload, 52)), - h('td', null, job.worker_node || '-'), - h( - 'td', - null, - h('div', null, job.chain.chain_id || '-'), - h('div', { className: 'tiny' }, `${job.task.task_id || '-'} / ${job.chain.task_ids.length} tasks`) - ), - h('td', null, shorten(job.result_payload || '-', 80)) - ) - ) - ) - ) - ); -} - -function App() { - const [jobs, setJobs] = useState([]); - const [payload, setPayload] = useState('demo payload'); - const [sql, setSql] = useState('SELECT token, SUM(score) AS total_score FROM rpc_input GROUP BY token'); - const [statusText, setStatusText] = useState('ready'); - const [hintText, setHintText] = useState(''); - const [detailText, setDetailText] = useState(''); - const [selectedJobId, setSelectedJobId] = useState(''); - const [submitting, setSubmitting] = useState(false); - - const refresh = 
useCallback(async () => { - const data = await api.listJobs(); - setJobs(Array.isArray(data.jobs) ? data.jobs : []); - setHintText(`更新于 ${new Date().toLocaleTimeString()},共 ${data.count} 条`); - }, []); - - useEffect(() => { - refresh().catch((error: unknown) => { - setHintText(error instanceof Error ? error.message : '刷新失败'); - }); - - const timer = window.setInterval(() => { - refresh().catch(() => {}); - }, 1000); - return () => { - window.clearInterval(timer); - }; - }, [refresh]); - - const submit = async () => { - setSubmitting(true); - setStatusText('提交中...'); - try { - if (!payload && !sql) { - setStatusText('payload 或 sql 至少填写一个'); - return; - } - const result = await api.submit(payload, sql); - const stateText = result.state || 'SUBMITTED'; - const statusCode = result.status_code || 'JOB_SUBMITTED'; - setStatusText(`任务已提交:${result.job_id}(${stateText}/${statusCode})`); - await refresh(); - const detail = await api.getJob(result.job_id); - setSelectedJobId(result.job_id); - setDetailText(JSON.stringify(detail, null, 2)); - } catch (error: unknown) { - setStatusText(error instanceof Error ? error.message : String(error)); - } finally { - setSubmitting(false); - } - }; - - const showDetail = async (jobId: string) => { - try { - const detail = await api.getJob(jobId); - setSelectedJobId(jobId); - setDetailText(JSON.stringify(detail, null, 2)); - } catch (error: unknown) { - setDetailText(error instanceof Error ? 
error.message : String(error)); - } - }; - - const applyPreset = (nextSql: string) => { - setSql(nextSql); - setStatusText(`已加载 SQL 模板`); - }; - - const applyComplexDemo = async (demoName: string, steps: ComplexSqlStep[]) => { - setSubmitting(true); - setStatusText(`开始执行 ${demoName}`); - setDetailText(''); - const logs: string[] = []; - let lastJobId = ''; - let finalDetail = ''; - try { - for (let i = 0; i < steps.length; i += 1) { - const step = steps[i]; - setStatusText(`执行 ${demoName}:${step.label}`); - const result = await api.submit(step.payload, step.sql); - lastJobId = result.job_id; - const stateText = result.state || 'SUBMITTED'; - const statusCode = result.status_code || 'JOB_SUBMITTED'; - logs.push(`${step.label}: ${result.job_id} -> ${stateText}/${statusCode}`); - await refresh(); - if (i === steps.length - 1) { - const detail = await api.getJob(result.job_id); - setSelectedJobId(result.job_id); - finalDetail = JSON.stringify(detail, null, 2); - } - } - setStatusText(`复杂 Demo 已提交:${lastJobId}`); - if (finalDetail) { - setDetailText(finalDetail); - } else { - setDetailText(logs.join('\n')); - } - } catch (error: unknown) { - const message = error instanceof Error ? 
error.message : String(error); - logs.push(`失败: ${message}`); - setStatusText(message); - } finally { - if (!finalDetail && logs.length > 0) { - setDetailText(logs.join('\n')); - } - setSubmitting(false); - } - }; - - return h( - 'div', - { className: 'panel-grid' }, - h( - 'div', - { className: 'panel' }, - h('h3', null, '提交任务'), - h('div', { className: 'row', style: { marginBottom: '10px' } }, - h('label', { className: 'tiny', style: { display: 'block', width: '100%' } }, 'Payload(用于 rpc_input / 调试)'), - h('input', { - className: 'small', - value: payload, - onInput: (event: InputEvent) => { - const target = event.target as HTMLInputElement | null; - if (target) setPayload(target.value); - }, - placeholder: 'demo payload' - }) - ), - h('div', { style: { marginBottom: '10px' } }, - h('label', { htmlFor: 'sqlInput' }, 'SQL(可选)'), - h('textarea', { - id: 'sqlInput', - value: sql, - onInput: (event: InputEvent) => { - const target = event.target as HTMLTextAreaElement | null; - if (target) setSql(target.value); - }, - placeholder: 'SELECT token, SUM(score) AS total_score FROM rpc_input GROUP BY token' - }), - h('div', { className: 'tiny', style: { marginTop: '4px' } }, '支持 CREATE TABLE / INSERT / SELECT 常用语法') - ), - h( - 'div', - { className: 'row', style: { marginBottom: '10px' } }, - ...sqlPresets.map((item) => - h( - 'button', - { - type: 'button', - onClick: () => applyPreset(item.sql), - style: { minWidth: 'auto', paddingLeft: '12px', paddingRight: '12px' }, - }, - item.name - ) - ) - ), - h( - 'div', - { className: 'row', style: { marginBottom: '10px' } }, - ...complexSqlDemos.map((demo) => - h( - 'button', - { - type: 'button', - disabled: submitting, - onClick: () => applyComplexDemo(demo.name, demo.steps), - style: { minWidth: 'auto', paddingLeft: '12px', paddingRight: '12px' }, - }, - `复杂Demo:${demo.name}` - ) - ) - ), - h( - 'div', - { className: 'row' }, - h('button', { type: 'button', onClick: submit, disabled: submitting }, submitting ? 
'提交中…' : '提交任务'), - h('span', { className: 'muted', id: 'submitStatus' }, statusText) - ) - ), - h( - 'div', - { className: 'panel' }, - h('h3', null, '运行作业'), - h( - 'div', - { className: 'row', style: { marginBottom: '10px' } }, - h('button', { type: 'button', onClick: refresh }, '刷新'), - h('span', { className: 'muted', id: 'refreshHint' }, hintText) - ), - h( - 'div', - { className: 'muted', style: { marginBottom: '6px' } }, - `${jobs.length} 条作业(最近排序)` - ), - h(JobList, { jobs, onSelect: showDetail }) - ), - h( - 'div', - { className: 'panel' }, - h('h3', null, '作业详情'), - h('div', { className: 'muted', style: { marginBottom: '6px' } }, - selectedJobId ? `JobId: ${selectedJobId}` : '点击任务行查看详情,或提交后自动展开' - ), - h('pre', { className: 'code' }, detailText || '暂无详情') - ) - ); -} - -const root = document.getElementById('dashboard-root'); -if (root) { - if (typeof ReactDOM.createRoot === 'function') { - ReactDOM.createRoot(root).render(h(App)); - } else { - ReactDOM.render(h(App), root); - } -} diff --git a/src/dataflow/runner/dashboard/index.html b/src/dataflow/runner/dashboard/index.html deleted file mode 100644 index 235bb4b..0000000 --- a/src/dataflow/runner/dashboard/index.html +++ /dev/null @@ -1,189 +0,0 @@ - - - - - - Dataflow Runtime Dashboard - - - - - -
-
- -
- - diff --git a/src/dataflow/tests/stream_runtime_test.cc b/src/dataflow/tests/stream_runtime_test.cc index 8897f6a..332112b 100644 --- a/src/dataflow/tests/stream_runtime_test.cc +++ b/src/dataflow/tests/stream_runtime_test.cc @@ -80,6 +80,13 @@ void expect(bool condition, const std::string& message) { } } +void expectContains(const std::string& haystack, const std::string& needle, + const std::string& message) { + if (haystack.find(needle) == std::string::npos) { + throw std::runtime_error(message + ": missing " + needle); + } +} + void testBackpressure() { dataflow::DataflowSession& session = dataflow::DataflowSession::builder(); std::vector batches; @@ -290,6 +297,73 @@ void testWindowEviction() { "window eviction should keep newest window key"); } +void testSnapshotJsonContract() { + dataflow::DataflowSession& session = dataflow::DataflowSession::builder(); + auto sink = std::make_shared(); + + dataflow::Table batch; + batch.schema = dataflow::Schema({"ts", "key", "value"}); + batch.rows = { + {dataflow::Value("2026-03-28T12:00:00"), dataflow::Value("userA"), dataflow::Value(int64_t(1))}, + {dataflow::Value("2026-03-28T12:00:10"), dataflow::Value("userB"), dataflow::Value(int64_t(2))}, + }; + + dataflow::StreamingQueryOptions options; + options.trigger_interval_ms = 0; + options.checkpoint_delivery_mode = dataflow::CheckpointDeliveryMode::BestEffort; + + auto query = session.readStream(std::make_shared(std::vector{batch})) + .writeStream(sink, options); + query.start(); + expect(query.awaitTermination() == 1, "snapshot contract query should process one batch"); + + const std::string snapshot = query.snapshotJson(); + expectContains(snapshot, "\"query_id\":", "snapshot should expose query_id"); + expectContains(snapshot, "\"status\":", "snapshot should expose status"); + expectContains(snapshot, "\"requested_execution_mode\":", + "snapshot should expose requested_execution_mode"); + expectContains(snapshot, "\"execution_mode\":", "snapshot should expose 
execution_mode"); + expectContains(snapshot, "\"execution_reason\":", "snapshot should expose execution_reason"); + expectContains(snapshot, "\"transport_mode\":", "snapshot should expose transport_mode"); + expectContains(snapshot, "\"blocked_count\":", "snapshot should expose blocked_count"); + expectContains(snapshot, "\"max_backlog_batches\":", "snapshot should expose max_backlog_batches"); + expectContains(snapshot, "\"inflight_batches\":", "snapshot should expose inflight_batches"); + expectContains(snapshot, "\"inflight_partitions\":", "snapshot should expose inflight_partitions"); + expectContains(snapshot, "\"last_batch_latency_ms\":", "snapshot should expose last_batch_latency_ms"); + expectContains(snapshot, "\"last_sink_latency_ms\":", "snapshot should expose last_sink_latency_ms"); + expectContains(snapshot, "\"last_state_latency_ms\":", "snapshot should expose last_state_latency_ms"); + expectContains(snapshot, "\"last_source_offset\":", "snapshot should expose last_source_offset"); + expectContains(snapshot, "\"backpressure_active\":", "snapshot should expose backpressure_active"); + expectContains(snapshot, "\"actor_eligible\":", "snapshot should expose actor_eligible"); + expectContains(snapshot, "\"used_actor_runtime\":", "snapshot should expose used_actor_runtime"); + expectContains(snapshot, "\"used_shared_memory\":", "snapshot should expose used_shared_memory"); + expectContains(snapshot, "\"has_stateful_ops\":", "snapshot should expose has_stateful_ops"); + expectContains(snapshot, "\"has_window\":", "snapshot should expose has_window"); + expectContains(snapshot, "\"sink_is_blocking\":", "snapshot should expose sink_is_blocking"); + expectContains(snapshot, "\"source_is_bounded\":", "snapshot should expose source_is_bounded"); + expectContains(snapshot, "\"estimated_partitions\":", "snapshot should expose estimated_partitions"); + expectContains(snapshot, "\"projected_payload_bytes\":", "snapshot should expose projected_payload_bytes"); + 
expectContains(snapshot, "\"sampled_batches\":", "snapshot should expose sampled_batches"); + expectContains(snapshot, "\"sampled_rows_per_batch\":", + "snapshot should expose sampled_rows_per_batch"); + expectContains(snapshot, "\"average_projected_payload_bytes\":", + "snapshot should expose average_projected_payload_bytes"); + expectContains(snapshot, "\"actor_speedup\":", "snapshot should expose actor_speedup"); + expectContains(snapshot, "\"compute_to_overhead_ratio\":", + "snapshot should expose compute_to_overhead_ratio"); + expectContains(snapshot, "\"estimated_state_size_bytes\":", + "snapshot should expose estimated_state_size_bytes"); + expectContains(snapshot, "\"estimated_batch_cost\":", "snapshot should expose estimated_batch_cost"); + expectContains(snapshot, "\"backpressure_max_queue_batches\":", + "snapshot should expose backpressure_max_queue_batches"); + expectContains(snapshot, "\"backpressure_high_watermark\":", + "snapshot should expose backpressure_high_watermark"); + expectContains(snapshot, "\"backpressure_low_watermark\":", + "snapshot should expose backpressure_low_watermark"); + expectContains(snapshot, "\"checkpoint_delivery_mode\":\"best-effort\"", + "snapshot should expose checkpoint_delivery_mode"); +} + } // namespace int main() { @@ -300,6 +374,7 @@ int main() { testCheckpointRestoreBestEffort(); testCheckpointRestoreDuplicatesSinkOutputAtLeastOnce(); testWindowEviction(); + testSnapshotJsonContract(); std::cout << "[test] stream runtime ok" << std::endl; return 0; } catch (const std::exception& ex) { diff --git a/src/dataflow/tests/vector_runtime_test.cc b/src/dataflow/tests/vector_runtime_test.cc index 666d39b..6773f7c 100644 --- a/src/dataflow/tests/vector_runtime_test.cc +++ b/src/dataflow/tests/vector_runtime_test.cc @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include #include #include "src/dataflow/api/session.h" @@ -151,6 +153,9 @@ int main() { expect(explain.find("metric=cosine") != std::string::npos, 
"explain metric missing"); expect(explain.find("dimension=3") != std::string::npos, "explain dimension missing"); expect(explain.find("top_k=2") != std::string::npos, "explain top_k missing"); + expect(explain.find("candidate_rows=3") != std::string::npos, "explain candidate_rows missing"); + expect(explain.find("filter_pushdown=false") != std::string::npos, + "explain filter_pushdown contract missing"); expect(explain.find("acceleration=flat-buffer+heap-topk") != std::string::npos, "explain acceleration hint missing"); @@ -168,6 +173,47 @@ int main() { expect(sparse.rows.size() == 1, "sparse vector query top-k mismatch"); expect(sparse.rows[0][0].asInt64() == 2, "sparse vector query should preserve source row id"); + const auto parsed_space = dataflow::Value::parseFixedVector("[1 0 0]"); + expect(parsed_space.size() == 3, "space-separated vector parse should keep dimension"); + expect(parsed_space[0] == 1.0f && parsed_space[1] == 0.0f && parsed_space[2] == 0.0f, + "space-separated vector parse content mismatch"); + + const auto parsed_comma = dataflow::Value::parseFixedVector("[0.9,0.1,0]"); + expect(parsed_comma.size() == 3, "comma-separated vector parse should keep dimension"); + expect(parsed_comma[0] == 0.9f && parsed_comma[1] == 0.1f && parsed_comma[2] == 0.0f, + "comma-separated vector parse content mismatch"); + + char csv_path_template[] = "/tmp/velaria-vector-runtime-XXXXXX"; + const int csv_fd = mkstemp(csv_path_template); + expect(csv_fd != -1, "mkstemp failed for vector csv test"); + close(csv_fd); + const std::string csv_path = csv_path_template; + { + std::ofstream csv(csv_path); + expect(csv.good(), "failed to open vector csv temp file"); + csv << "id,embedding\n"; + csv << "1,[1 0 0]\n"; + csv << "2,[0.9 0.1 0]\n"; + csv << "3,[0 1 0]\n"; + } + + auto csv_df = session.read_csv(csv_path); + session.createTempView("vec_csv_src", csv_df); + const auto csv_cosine = session.vectorQuery("vec_csv_src", "embedding", {1.0f, 0.0f, 0.0f}, 2, + 
dataflow::VectorDistanceMetric::Cosine) + .toTable(); + std::remove(csv_path.c_str()); + expect(csv_cosine.rows.size() == 2, "csv vector query top-k mismatch"); + expect(csv_cosine.rows[0][0].asInt64() == 0, "csv vector query nearest should be row 0"); + + const auto csv_explain = session.explainVectorQuery("vec_csv_src", "embedding", + {1.0f, 0.0f, 0.0f}, 2, + dataflow::VectorDistanceMetric::Cosine); + expect(csv_explain.find("mode=exact-scan") != std::string::npos, + "csv explain mode missing"); + expect(csv_explain.find("candidate_rows=3") != std::string::npos, + "csv explain candidate_rows missing"); + std::cout << "[test] vector runtime query and transport ok" << std::endl; return 0; } catch (const std::exception& ex) { From acdb79528686622b784b4f52fec99756290edaa3 Mon Sep 17 00:00:00 2001 From: zuolingxuan Date: Mon, 30 Mar 2026 17:01:25 +0800 Subject: [PATCH 2/3] Physically reorganize source layers --- BUILD.bazel | 216 +++++++++--------- docs/core-boundary.md | 23 +- .../{ => core/contract}/api/dataframe.cc | 4 +- .../{ => core/contract}/api/dataframe.h | 8 +- .../{ => core/contract}/api/session.cc | 4 +- .../{ => core/contract}/api/session.h | 10 +- .../{ => core/contract}/catalog/catalog.cc | 2 +- .../{ => core/contract}/catalog/catalog.h | 8 +- .../contract}/source_sink_abi.h | 2 +- src/dataflow/core/{ => execution}/csv.cc | 2 +- src/dataflow/core/{ => execution}/csv.h | 2 +- .../{ => core/execution}/runtime/executor.cc | 4 +- .../{ => core/execution}/runtime/executor.h | 2 +- .../execution}/runtime/observability.h | 0 .../execution}/runtime/vector_index.cc | 2 +- .../execution}/runtime/vector_index.h | 0 .../{ => core/execution}/serial/serializer.cc | 2 +- .../{ => core/execution}/serial/serializer.h | 2 +- .../execution}/stream/binary_row_batch.cc | 2 +- .../execution}/stream/binary_row_batch.h | 2 +- .../{ => core/execution}/stream/stream.cc | 8 +- .../{ => core/execution}/stream/stream.h | 4 +- src/dataflow/core/{ => execution}/table.cc | 2 +- 
src/dataflow/core/{ => execution}/table.h | 2 +- src/dataflow/core/{ => execution}/value.cc | 2 +- src/dataflow/core/{ => execution}/value.h | 0 .../{ => core/logical}/planner/plan.cc | 4 +- .../{ => core/logical}/planner/plan.h | 2 +- src/dataflow/{ => core/logical}/sql/sql_ast.h | 4 +- .../{ => core/logical}/sql/sql_errors.h | 0 .../{ => core/logical}/sql/sql_parser.cc | 2 +- .../{ => core/logical}/sql/sql_parser.h | 4 +- .../{ => core/logical}/sql/sql_planner.cc | 2 +- .../{ => core/logical}/sql/sql_planner.h | 8 +- src/dataflow/examples/actor_rpc_client.cc | 2 +- src/dataflow/examples/actor_rpc_scheduler.cc | 2 +- src/dataflow/examples/actor_rpc_smoke.cc | 2 +- src/dataflow/examples/actor_rpc_worker.cc | 2 +- src/dataflow/examples/dataframe_demo.cc | 4 +- src/dataflow/examples/sql_demo.cc | 4 +- .../examples/stream_actor_benchmark.cc | 4 +- src/dataflow/examples/stream_benchmark.cc | 6 +- src/dataflow/examples/stream_demo.cc | 4 +- src/dataflow/examples/stream_sql_demo.cc | 4 +- .../examples/stream_state_container_demo.cc | 2 +- src/dataflow/examples/stream_stateful_demo.cc | 4 +- .../examples/tpch_q1_style_benchmark.cc | 2 +- .../examples/vector_search_benchmark.cc | 8 +- src/dataflow/examples/velaria_cli.cc | 4 +- src/dataflow/examples/wordcount.cc | 2 +- .../{ => experimental}/rpc/actor_rpc_codec.cc | 4 +- .../{ => experimental}/rpc/actor_rpc_codec.h | 2 +- .../{ => experimental}/rpc/rpc_codec.cc | 4 +- .../{ => experimental}/rpc/rpc_codec.h | 2 +- .../{ => experimental}/rpc/serialization.cc | 4 +- .../{ => experimental}/rpc/serialization.h | 2 +- .../runner/actor_runtime.cc | 20 +- .../{ => experimental}/runner/actor_runtime.h | 0 .../runtime/actor_runtime.cc | 2 +- .../runtime/actor_runtime.h | 6 +- .../runtime/byte_transport.cc | 2 +- .../runtime/byte_transport.h | 0 .../{ => experimental}/runtime/job_master.cc | 4 +- .../{ => experimental}/runtime/job_master.h | 6 +- .../{ => experimental}/runtime/rpc_contract.h | 4 +- .../{ => 
experimental}/runtime/rpc_runner.cc | 2 +- .../{ => experimental}/runtime/rpc_runner.h | 8 +- .../stream/actor_stream_runtime.cc | 8 +- .../stream/actor_stream_runtime.h | 2 +- .../transport/ipc_transport.cc | 4 +- .../transport/ipc_transport.h | 2 +- .../{ => interop}/python/python_module.cc | 10 +- src/dataflow/tests/planner_v03_test.cc | 4 +- src/dataflow/tests/source_sink_abi_test.cc | 6 +- src/dataflow/tests/sql_regression_test.cc | 8 +- .../tests/stream_actor_credit_test.cc | 2 +- src/dataflow/tests/stream_runtime_test.cc | 4 +- .../tests/stream_strategy_explain_test.cc | 4 +- src/dataflow/tests/vector_runtime_test.cc | 10 +- 79 files changed, 270 insertions(+), 263 deletions(-) rename src/dataflow/{ => core/contract}/api/dataframe.cc (99%) rename src/dataflow/{ => core/contract}/api/dataframe.h (93%) rename src/dataflow/{ => core/contract}/api/session.cc (99%) rename src/dataflow/{ => core/contract}/api/session.h (88%) rename src/dataflow/{ => core/contract}/catalog/catalog.cc (97%) rename src/dataflow/{ => core/contract}/catalog/catalog.h (82%) rename src/dataflow/{stream => core/contract}/source_sink_abi.h (97%) rename src/dataflow/core/{ => execution}/csv.cc (98%) rename src/dataflow/core/{ => execution}/csv.h (81%) rename src/dataflow/{ => core/execution}/runtime/executor.cc (99%) rename src/dataflow/{ => core/execution}/runtime/executor.h (94%) rename src/dataflow/{ => core/execution}/runtime/observability.h (100%) rename src/dataflow/{ => core/execution}/runtime/vector_index.cc (99%) rename src/dataflow/{ => core/execution}/runtime/vector_index.h (100%) rename src/dataflow/{ => core/execution}/serial/serializer.cc (99%) rename src/dataflow/{ => core/execution}/serial/serializer.h (95%) rename src/dataflow/{ => core/execution}/stream/binary_row_batch.cc (99%) rename src/dataflow/{ => core/execution}/stream/binary_row_batch.h (99%) rename src/dataflow/{ => core/execution}/stream/stream.cc (99%) rename src/dataflow/{ => core/execution}/stream/stream.h 
(99%) rename src/dataflow/core/{ => execution}/table.cc (91%) rename src/dataflow/core/{ => execution}/table.h (93%) rename src/dataflow/core/{ => execution}/value.cc (59%) rename src/dataflow/core/{ => execution}/value.h (100%) rename src/dataflow/{ => core/logical}/planner/plan.cc (99%) rename src/dataflow/{ => core/logical}/planner/plan.h (98%) rename src/dataflow/{ => core/logical}/sql/sql_ast.h (95%) rename src/dataflow/{ => core/logical}/sql/sql_errors.h (100%) rename src/dataflow/{ => core/logical}/sql/sql_parser.cc (99%) rename src/dataflow/{ => core/logical}/sql/sql_parser.h (67%) rename src/dataflow/{ => core/logical}/sql/sql_planner.cc (99%) rename src/dataflow/{ => core/logical}/sql/sql_planner.h (93%) rename src/dataflow/{ => experimental}/rpc/actor_rpc_codec.cc (96%) rename src/dataflow/{ => experimental}/rpc/actor_rpc_codec.h (96%) rename src/dataflow/{ => experimental}/rpc/rpc_codec.cc (98%) rename src/dataflow/{ => experimental}/rpc/rpc_codec.h (98%) rename src/dataflow/{ => experimental}/rpc/serialization.cc (98%) rename src/dataflow/{ => experimental}/rpc/serialization.h (91%) rename src/dataflow/{ => experimental}/runner/actor_runtime.cc (98%) rename src/dataflow/{ => experimental}/runner/actor_runtime.h (100%) rename src/dataflow/{ => experimental}/runtime/actor_runtime.cc (99%) rename src/dataflow/{ => experimental}/runtime/actor_runtime.h (97%) rename src/dataflow/{ => experimental}/runtime/byte_transport.cc (97%) rename src/dataflow/{ => experimental}/runtime/byte_transport.h (100%) rename src/dataflow/{ => experimental}/runtime/job_master.cc (99%) rename src/dataflow/{ => experimental}/runtime/job_master.h (97%) rename src/dataflow/{ => experimental}/runtime/rpc_contract.h (98%) rename src/dataflow/{ => experimental}/runtime/rpc_runner.cc (99%) rename src/dataflow/{ => experimental}/runtime/rpc_runner.h (93%) rename src/dataflow/{ => experimental}/stream/actor_stream_runtime.cc (99%) rename src/dataflow/{ => 
experimental}/stream/actor_stream_runtime.h (98%) rename src/dataflow/{ => experimental}/transport/ipc_transport.cc (97%) rename src/dataflow/{ => experimental}/transport/ipc_transport.h (95%) rename src/dataflow/{ => interop}/python/python_module.cc (99%) diff --git a/BUILD.bazel b/BUILD.bazel index 9b9418c..66db720 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,14 +3,14 @@ load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") filegroup( name = "velaria_core_logical_sources", srcs = [ - "src/dataflow/planner/plan.cc", - "src/dataflow/planner/plan.h", - "src/dataflow/sql/sql_ast.h", - "src/dataflow/sql/sql_errors.h", - "src/dataflow/sql/sql_parser.cc", - "src/dataflow/sql/sql_parser.h", - "src/dataflow/sql/sql_planner.cc", - "src/dataflow/sql/sql_planner.h", + "src/dataflow/core/logical/planner/plan.cc", + "src/dataflow/core/logical/planner/plan.h", + "src/dataflow/core/logical/sql/sql_ast.h", + "src/dataflow/core/logical/sql/sql_errors.h", + "src/dataflow/core/logical/sql/sql_parser.cc", + "src/dataflow/core/logical/sql/sql_parser.h", + "src/dataflow/core/logical/sql/sql_planner.cc", + "src/dataflow/core/logical/sql/sql_planner.h", ], visibility = ["//visibility:public"], ) @@ -18,26 +18,23 @@ filegroup( filegroup( name = "velaria_core_execution_sources", srcs = [ - "src/dataflow/core/csv.cc", - "src/dataflow/core/csv.h", - "src/dataflow/core/table.cc", - "src/dataflow/core/table.h", - "src/dataflow/core/value.cc", - "src/dataflow/core/value.h", - "src/dataflow/runtime/executor.cc", - "src/dataflow/runtime/executor.h", - "src/dataflow/runtime/job_master.cc", - "src/dataflow/runtime/job_master.h", - "src/dataflow/runtime/observability.h", - "src/dataflow/runtime/rpc_contract.h", - "src/dataflow/runtime/vector_index.cc", - "src/dataflow/runtime/vector_index.h", - "src/dataflow/serial/serializer.cc", - "src/dataflow/serial/serializer.h", - "src/dataflow/stream/binary_row_batch.cc", - "src/dataflow/stream/binary_row_batch.h", - 
"src/dataflow/stream/stream.cc", - "src/dataflow/stream/stream.h", + "src/dataflow/core/execution/csv.cc", + "src/dataflow/core/execution/csv.h", + "src/dataflow/core/execution/table.cc", + "src/dataflow/core/execution/table.h", + "src/dataflow/core/execution/value.cc", + "src/dataflow/core/execution/value.h", + "src/dataflow/core/execution/runtime/executor.cc", + "src/dataflow/core/execution/runtime/executor.h", + "src/dataflow/core/execution/runtime/observability.h", + "src/dataflow/core/execution/runtime/vector_index.cc", + "src/dataflow/core/execution/runtime/vector_index.h", + "src/dataflow/core/execution/serial/serializer.cc", + "src/dataflow/core/execution/serial/serializer.h", + "src/dataflow/core/execution/stream/binary_row_batch.cc", + "src/dataflow/core/execution/stream/binary_row_batch.h", + "src/dataflow/core/execution/stream/stream.cc", + "src/dataflow/core/execution/stream/stream.h", ], visibility = ["//visibility:public"], ) @@ -45,13 +42,13 @@ filegroup( filegroup( name = "velaria_core_contract_sources", srcs = [ - "src/dataflow/api/dataframe.cc", - "src/dataflow/api/dataframe.h", - "src/dataflow/api/session.cc", - "src/dataflow/api/session.h", - "src/dataflow/catalog/catalog.cc", - "src/dataflow/catalog/catalog.h", - "src/dataflow/stream/source_sink_abi.h", + "src/dataflow/core/contract/api/dataframe.cc", + "src/dataflow/core/contract/api/dataframe.h", + "src/dataflow/core/contract/api/session.cc", + "src/dataflow/core/contract/api/session.h", + "src/dataflow/core/contract/catalog/catalog.cc", + "src/dataflow/core/contract/catalog/catalog.h", + "src/dataflow/core/contract/source_sink_abi.h", ], visibility = ["//visibility:public"], ) @@ -62,7 +59,7 @@ filegroup( "//python_api:velaria_python_supported_sources", "//python_api:velaria_python_example_sources", "//python_api:velaria_python_experimental_sources", - "src/dataflow/python/python_module.cc", + "src/dataflow/interop/python/python_module.cc", ], visibility = ["//visibility:public"], ) @@ 
-72,24 +69,27 @@ filegroup( srcs = [ "src/dataflow/ai/plugin_runtime.cc", "src/dataflow/ai/plugin_runtime.h", - "src/dataflow/rpc/actor_rpc_codec.cc", - "src/dataflow/rpc/actor_rpc_codec.h", - "src/dataflow/rpc/rpc_codec.cc", - "src/dataflow/rpc/rpc_codec.h", - "src/dataflow/rpc/serialization.cc", - "src/dataflow/rpc/serialization.h", - "src/dataflow/runner/actor_runtime.cc", - "src/dataflow/runner/actor_runtime.h", - "src/dataflow/runtime/actor_runtime.cc", - "src/dataflow/runtime/actor_runtime.h", - "src/dataflow/runtime/byte_transport.cc", - "src/dataflow/runtime/byte_transport.h", - "src/dataflow/runtime/rpc_runner.cc", - "src/dataflow/runtime/rpc_runner.h", - "src/dataflow/stream/actor_stream_runtime.cc", - "src/dataflow/stream/actor_stream_runtime.h", - "src/dataflow/transport/ipc_transport.cc", - "src/dataflow/transport/ipc_transport.h", + "src/dataflow/experimental/rpc/actor_rpc_codec.cc", + "src/dataflow/experimental/rpc/actor_rpc_codec.h", + "src/dataflow/experimental/rpc/rpc_codec.cc", + "src/dataflow/experimental/rpc/rpc_codec.h", + "src/dataflow/experimental/rpc/serialization.cc", + "src/dataflow/experimental/rpc/serialization.h", + "src/dataflow/experimental/runner/actor_runtime.cc", + "src/dataflow/experimental/runner/actor_runtime.h", + "src/dataflow/experimental/runtime/actor_runtime.cc", + "src/dataflow/experimental/runtime/actor_runtime.h", + "src/dataflow/experimental/runtime/byte_transport.cc", + "src/dataflow/experimental/runtime/byte_transport.h", + "src/dataflow/experimental/runtime/job_master.cc", + "src/dataflow/experimental/runtime/job_master.h", + "src/dataflow/experimental/runtime/rpc_contract.h", + "src/dataflow/experimental/runtime/rpc_runner.cc", + "src/dataflow/experimental/runtime/rpc_runner.h", + "src/dataflow/experimental/stream/actor_stream_runtime.cc", + "src/dataflow/experimental/stream/actor_stream_runtime.h", + "src/dataflow/experimental/transport/ipc_transport.cc", + "src/dataflow/experimental/transport/ipc_transport.h", ], 
visibility = ["//visibility:public"], ) @@ -103,55 +103,55 @@ filegroup( cc_library( name = "dataflow_core", srcs = [ - "src/dataflow/core/value.cc", - "src/dataflow/core/table.cc", - "src/dataflow/core/csv.cc", - "src/dataflow/catalog/catalog.cc", - "src/dataflow/api/dataframe.cc", - "src/dataflow/api/session.cc", - "src/dataflow/serial/serializer.cc", - "src/dataflow/planner/plan.cc", - "src/dataflow/runtime/executor.cc", - "src/dataflow/runtime/job_master.cc", - "src/dataflow/runtime/byte_transport.cc", - "src/dataflow/runtime/rpc_runner.cc", - "src/dataflow/runtime/vector_index.cc", - "src/dataflow/rpc/rpc_codec.cc", - "src/dataflow/transport/ipc_transport.cc", + "src/dataflow/core/execution/value.cc", + "src/dataflow/core/execution/table.cc", + "src/dataflow/core/execution/csv.cc", + "src/dataflow/core/contract/catalog/catalog.cc", + "src/dataflow/core/contract/api/dataframe.cc", + "src/dataflow/core/contract/api/session.cc", + "src/dataflow/core/execution/serial/serializer.cc", + "src/dataflow/core/logical/planner/plan.cc", + "src/dataflow/core/execution/runtime/executor.cc", + "src/dataflow/experimental/runtime/job_master.cc", + "src/dataflow/experimental/runtime/byte_transport.cc", + "src/dataflow/experimental/runtime/rpc_runner.cc", + "src/dataflow/core/execution/runtime/vector_index.cc", + "src/dataflow/experimental/rpc/rpc_codec.cc", + "src/dataflow/experimental/transport/ipc_transport.cc", "src/dataflow/ai/plugin_runtime.cc", - "src/dataflow/sql/sql_parser.cc", - "src/dataflow/sql/sql_planner.cc", - "src/dataflow/stream/binary_row_batch.cc", - "src/dataflow/stream/actor_stream_runtime.cc", - "src/dataflow/stream/stream.cc", + "src/dataflow/core/logical/sql/sql_parser.cc", + "src/dataflow/core/logical/sql/sql_planner.cc", + "src/dataflow/core/execution/stream/binary_row_batch.cc", + "src/dataflow/experimental/stream/actor_stream_runtime.cc", + "src/dataflow/core/execution/stream/stream.cc", ], hdrs = [ - "src/dataflow/core/value.h", - 
"src/dataflow/core/table.h", - "src/dataflow/core/csv.h", - "src/dataflow/catalog/catalog.h", - "src/dataflow/api/dataframe.h", - "src/dataflow/api/session.h", - "src/dataflow/serial/serializer.h", - "src/dataflow/sql/sql_ast.h", - "src/dataflow/sql/sql_errors.h", - "src/dataflow/sql/sql_parser.h", - "src/dataflow/sql/sql_planner.h", - "src/dataflow/planner/plan.h", - "src/dataflow/runtime/executor.h", - "src/dataflow/runtime/byte_transport.h", - "src/dataflow/runtime/rpc_contract.h", - "src/dataflow/runtime/job_master.h", - "src/dataflow/runtime/observability.h", - "src/dataflow/runtime/rpc_runner.h", - "src/dataflow/runtime/vector_index.h", + "src/dataflow/core/execution/value.h", + "src/dataflow/core/execution/table.h", + "src/dataflow/core/execution/csv.h", + "src/dataflow/core/contract/catalog/catalog.h", + "src/dataflow/core/contract/api/dataframe.h", + "src/dataflow/core/contract/api/session.h", + "src/dataflow/core/execution/serial/serializer.h", + "src/dataflow/core/logical/sql/sql_ast.h", + "src/dataflow/core/logical/sql/sql_errors.h", + "src/dataflow/core/logical/sql/sql_parser.h", + "src/dataflow/core/logical/sql/sql_planner.h", + "src/dataflow/core/logical/planner/plan.h", + "src/dataflow/core/execution/runtime/executor.h", + "src/dataflow/experimental/runtime/byte_transport.h", + "src/dataflow/experimental/runtime/rpc_contract.h", + "src/dataflow/experimental/runtime/job_master.h", + "src/dataflow/core/execution/runtime/observability.h", + "src/dataflow/experimental/runtime/rpc_runner.h", + "src/dataflow/core/execution/runtime/vector_index.h", "src/dataflow/ai/plugin_runtime.h", - "src/dataflow/rpc/rpc_codec.h", - "src/dataflow/transport/ipc_transport.h", - "src/dataflow/stream/binary_row_batch.h", - "src/dataflow/stream/actor_stream_runtime.h", - "src/dataflow/stream/source_sink_abi.h", - "src/dataflow/stream/stream.h", + "src/dataflow/experimental/rpc/rpc_codec.h", + "src/dataflow/experimental/transport/ipc_transport.h", + 
"src/dataflow/core/execution/stream/binary_row_batch.h", + "src/dataflow/experimental/stream/actor_stream_runtime.h", + "src/dataflow/core/contract/source_sink_abi.h", + "src/dataflow/core/execution/stream/stream.h", ], visibility = ["//visibility:public"], ) @@ -208,7 +208,7 @@ cc_binary( cc_binary( name = "_velaria_native", - srcs = ["src/dataflow/python/python_module.cc"], + srcs = ["src/dataflow/interop/python/python_module.cc"], linkopts = select({ "@bazel_tools//src/conditions:darwin": [ "-undefined", @@ -265,12 +265,12 @@ cc_binary( cc_library( name = "dataflow_actor_rpc_codec", srcs = [ - "src/dataflow/rpc/actor_rpc_codec.cc", - "src/dataflow/rpc/serialization.cc", + "src/dataflow/experimental/rpc/actor_rpc_codec.cc", + "src/dataflow/experimental/rpc/serialization.cc", ], hdrs = [ - "src/dataflow/rpc/actor_rpc_codec.h", - "src/dataflow/rpc/serialization.h", + "src/dataflow/experimental/rpc/actor_rpc_codec.h", + "src/dataflow/experimental/rpc/serialization.h", ], deps = [":dataflow_core"], visibility = ["//visibility:public"], @@ -291,10 +291,10 @@ cc_library( cc_library( name = "dataflow_actor_runtime", srcs = [ - "src/dataflow/runtime/actor_runtime.cc", + "src/dataflow/experimental/runtime/actor_runtime.cc", ], hdrs = [ - "src/dataflow/runtime/actor_runtime.h", + "src/dataflow/experimental/runtime/actor_runtime.h", ], deps = [ ":dataflow_core", @@ -305,10 +305,10 @@ cc_library( cc_library( name = "dataflow_actor_runner", srcs = [ - "src/dataflow/runner/actor_runtime.cc", + "src/dataflow/experimental/runner/actor_runtime.cc", ], hdrs = [ - "src/dataflow/runner/actor_runtime.h", + "src/dataflow/experimental/runner/actor_runtime.h", ], deps = [ ":dataflow_actor_rpc_codec", diff --git a/docs/core-boundary.md b/docs/core-boundary.md index 5c82a38..1f6bb66 100644 --- a/docs/core-boundary.md +++ b/docs/core-boundary.md @@ -144,11 +144,18 @@ Repository view: ## Full-Reorg Note -This reorg is implemented first through repository-facing structure: - -- layered Bazel 
source groups -- layered regression suites -- layered documentation -- README and Python ecosystem reordering - -Source paths still live under the existing `src/dataflow` and `python_api` roots so the current build graph and examples remain stable while the layer boundaries become explicit and enforceable. +This reorg is now implemented in two layers: + +- repository-facing structure: + - layered Bazel source groups + - layered regression suites + - layered documentation + - README and Python ecosystem reordering +- physical source layout: + - `src/dataflow/core/logical` + - `src/dataflow/core/execution` + - `src/dataflow/core/contract` + - `src/dataflow/interop` + - `src/dataflow/experimental` + +The build graph is still being separated incrementally. Some targets still depend across layers for compatibility while the physical directory split is established and validated. diff --git a/src/dataflow/api/dataframe.cc b/src/dataflow/core/contract/api/dataframe.cc similarity index 99% rename from src/dataflow/api/dataframe.cc rename to src/dataflow/core/contract/api/dataframe.cc index d2d253a..0454eea 100644 --- a/src/dataflow/api/dataframe.cc +++ b/src/dataflow/core/contract/api/dataframe.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/api/dataframe.h" +#include "src/dataflow/core/contract/api/dataframe.h" #include #include @@ -7,7 +7,7 @@ #include #include "src/dataflow/ai/plugin_runtime.h" -#include "src/dataflow/runtime/vector_index.h" +#include "src/dataflow/core/execution/runtime/vector_index.h" namespace dataflow { diff --git a/src/dataflow/api/dataframe.h b/src/dataflow/core/contract/api/dataframe.h similarity index 93% rename from src/dataflow/api/dataframe.h rename to src/dataflow/core/contract/api/dataframe.h index f31615d..9b58e93 100644 --- a/src/dataflow/api/dataframe.h +++ b/src/dataflow/core/contract/api/dataframe.h @@ -6,10 +6,10 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/planner/plan.h" -#include 
"src/dataflow/runtime/executor.h" -#include "src/dataflow/runtime/job_master.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/logical/planner/plan.h" +#include "src/dataflow/core/execution/runtime/executor.h" +#include "src/dataflow/experimental/runtime/job_master.h" namespace dataflow { diff --git a/src/dataflow/api/session.cc b/src/dataflow/core/contract/api/session.cc similarity index 99% rename from src/dataflow/api/session.cc rename to src/dataflow/core/contract/api/session.cc index 66f980e..6eb79db 100644 --- a/src/dataflow/api/session.cc +++ b/src/dataflow/core/contract/api/session.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/api/session.h" +#include "src/dataflow/core/contract/api/session.h" #include #include @@ -11,7 +11,7 @@ #include #include -#include "src/dataflow/core/csv.h" +#include "src/dataflow/core/execution/csv.h" #include "src/dataflow/ai/plugin_runtime.h" namespace dataflow { diff --git a/src/dataflow/api/session.h b/src/dataflow/core/contract/api/session.h similarity index 88% rename from src/dataflow/api/session.h rename to src/dataflow/core/contract/api/session.h index b6ce22a..a48a7b6 100644 --- a/src/dataflow/api/session.h +++ b/src/dataflow/core/contract/api/session.h @@ -1,10 +1,10 @@ #pragma once -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/catalog/catalog.h" -#include "src/dataflow/stream/stream.h" -#include "src/dataflow/sql/sql_parser.h" -#include "src/dataflow/sql/sql_planner.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/catalog/catalog.h" +#include "src/dataflow/core/execution/stream/stream.h" +#include "src/dataflow/core/logical/sql/sql_parser.h" +#include "src/dataflow/core/logical/sql/sql_planner.h" #include namespace dataflow { diff --git a/src/dataflow/catalog/catalog.cc b/src/dataflow/core/contract/catalog/catalog.cc similarity index 97% rename from src/dataflow/catalog/catalog.cc rename to 
src/dataflow/core/contract/catalog/catalog.cc index afed6ec..b8b729a 100644 --- a/src/dataflow/catalog/catalog.cc +++ b/src/dataflow/core/contract/catalog/catalog.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/catalog/catalog.h" +#include "src/dataflow/core/contract/catalog/catalog.h" namespace dataflow { diff --git a/src/dataflow/catalog/catalog.h b/src/dataflow/core/contract/catalog/catalog.h similarity index 82% rename from src/dataflow/catalog/catalog.h rename to src/dataflow/core/contract/catalog/catalog.h index 10dff8d..99b4860 100644 --- a/src/dataflow/catalog/catalog.h +++ b/src/dataflow/core/contract/catalog/catalog.h @@ -5,10 +5,10 @@ #include #include -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/sql/sql_errors.h" -#include "src/dataflow/core/table.h" -#include "src/dataflow/sql/sql_ast.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/logical/sql/sql_errors.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/logical/sql/sql_ast.h" namespace dataflow { diff --git a/src/dataflow/stream/source_sink_abi.h b/src/dataflow/core/contract/source_sink_abi.h similarity index 97% rename from src/dataflow/stream/source_sink_abi.h rename to src/dataflow/core/contract/source_sink_abi.h index 9ea629d..106a8b2 100644 --- a/src/dataflow/stream/source_sink_abi.h +++ b/src/dataflow/core/contract/source_sink_abi.h @@ -4,7 +4,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/core/csv.cc b/src/dataflow/core/execution/csv.cc similarity index 98% rename from src/dataflow/core/csv.cc rename to src/dataflow/core/execution/csv.cc index 3caa11b..3899804 100644 --- a/src/dataflow/core/csv.cc +++ b/src/dataflow/core/execution/csv.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/core/csv.h" +#include "src/dataflow/core/execution/csv.h" #include #include diff --git a/src/dataflow/core/csv.h 
b/src/dataflow/core/execution/csv.h similarity index 81% rename from src/dataflow/core/csv.h rename to src/dataflow/core/execution/csv.h index f4b7c77..3154562 100644 --- a/src/dataflow/core/csv.h +++ b/src/dataflow/core/execution/csv.h @@ -2,7 +2,7 @@ #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/runtime/executor.cc b/src/dataflow/core/execution/runtime/executor.cc similarity index 99% rename from src/dataflow/runtime/executor.cc rename to src/dataflow/core/execution/runtime/executor.cc index 18c97bc..d0b215b 100644 --- a/src/dataflow/runtime/executor.cc +++ b/src/dataflow/core/execution/runtime/executor.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/executor.h" +#include "src/dataflow/core/execution/runtime/executor.h" #include #include @@ -8,7 +8,7 @@ #include #include -#include "src/dataflow/runtime/rpc_runner.h" +#include "src/dataflow/experimental/runtime/rpc_runner.h" namespace dataflow { diff --git a/src/dataflow/runtime/executor.h b/src/dataflow/core/execution/runtime/executor.h similarity index 94% rename from src/dataflow/runtime/executor.h rename to src/dataflow/core/execution/runtime/executor.h index 1d628d7..a5e856b 100644 --- a/src/dataflow/runtime/executor.h +++ b/src/dataflow/core/execution/runtime/executor.h @@ -3,7 +3,7 @@ #include #include -#include "src/dataflow/planner/plan.h" +#include "src/dataflow/core/logical/planner/plan.h" namespace dataflow { diff --git a/src/dataflow/runtime/observability.h b/src/dataflow/core/execution/runtime/observability.h similarity index 100% rename from src/dataflow/runtime/observability.h rename to src/dataflow/core/execution/runtime/observability.h diff --git a/src/dataflow/runtime/vector_index.cc b/src/dataflow/core/execution/runtime/vector_index.cc similarity index 99% rename from src/dataflow/runtime/vector_index.cc rename to src/dataflow/core/execution/runtime/vector_index.cc index d23a03d..f9f1b1b 100644 
--- a/src/dataflow/runtime/vector_index.cc +++ b/src/dataflow/core/execution/runtime/vector_index.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/vector_index.h" +#include "src/dataflow/core/execution/runtime/vector_index.h" #include #include diff --git a/src/dataflow/runtime/vector_index.h b/src/dataflow/core/execution/runtime/vector_index.h similarity index 100% rename from src/dataflow/runtime/vector_index.h rename to src/dataflow/core/execution/runtime/vector_index.h diff --git a/src/dataflow/serial/serializer.cc b/src/dataflow/core/execution/serial/serializer.cc similarity index 99% rename from src/dataflow/serial/serializer.cc rename to src/dataflow/core/execution/serial/serializer.cc index 359f199..d0c970c 100644 --- a/src/dataflow/serial/serializer.cc +++ b/src/dataflow/core/execution/serial/serializer.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/serial/serializer.h" +#include "src/dataflow/core/execution/serial/serializer.h" #include #include diff --git a/src/dataflow/serial/serializer.h b/src/dataflow/core/execution/serial/serializer.h similarity index 95% rename from src/dataflow/serial/serializer.h rename to src/dataflow/core/execution/serial/serializer.h index 2877ec3..41b0dda 100644 --- a/src/dataflow/serial/serializer.h +++ b/src/dataflow/core/execution/serial/serializer.h @@ -3,7 +3,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/stream/binary_row_batch.cc b/src/dataflow/core/execution/stream/binary_row_batch.cc similarity index 99% rename from src/dataflow/stream/binary_row_batch.cc rename to src/dataflow/core/execution/stream/binary_row_batch.cc index e3c6774..b3ea5c8 100644 --- a/src/dataflow/stream/binary_row_batch.cc +++ b/src/dataflow/core/execution/stream/binary_row_batch.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/stream/binary_row_batch.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" #include #include diff --git 
a/src/dataflow/stream/binary_row_batch.h b/src/dataflow/core/execution/stream/binary_row_batch.h similarity index 99% rename from src/dataflow/stream/binary_row_batch.h rename to src/dataflow/core/execution/stream/binary_row_batch.h index 60004e2..2a1cdc9 100644 --- a/src/dataflow/stream/binary_row_batch.h +++ b/src/dataflow/core/execution/stream/binary_row_batch.h @@ -10,7 +10,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/stream/stream.cc b/src/dataflow/core/execution/stream/stream.cc similarity index 99% rename from src/dataflow/stream/stream.cc rename to src/dataflow/core/execution/stream/stream.cc index 018a755..7601988 100644 --- a/src/dataflow/stream/stream.cc +++ b/src/dataflow/core/execution/stream/stream.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/execution/stream/stream.h" #include #include @@ -24,9 +24,9 @@ #endif #include "src/dataflow/ai/plugin_runtime.h" -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/core/csv.h" -#include "src/dataflow/stream/actor_stream_runtime.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/execution/csv.h" +#include "src/dataflow/experimental/stream/actor_stream_runtime.h" namespace dataflow { diff --git a/src/dataflow/stream/stream.h b/src/dataflow/core/execution/stream/stream.h similarity index 99% rename from src/dataflow/stream/stream.h rename to src/dataflow/core/execution/stream/stream.h index c3a62bc..dd74606 100644 --- a/src/dataflow/stream/stream.h +++ b/src/dataflow/core/execution/stream/stream.h @@ -11,8 +11,8 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/stream/source_sink_abi.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/contract/source_sink_abi.h" namespace dataflow { diff --git a/src/dataflow/core/table.cc 
b/src/dataflow/core/execution/table.cc similarity index 91% rename from src/dataflow/core/table.cc rename to src/dataflow/core/execution/table.cc index 06b298b..e1d176c 100644 --- a/src/dataflow/core/table.cc +++ b/src/dataflow/core/execution/table.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" #include diff --git a/src/dataflow/core/table.h b/src/dataflow/core/execution/table.h similarity index 93% rename from src/dataflow/core/table.h rename to src/dataflow/core/execution/table.h index 7e23561..3e217d5 100644 --- a/src/dataflow/core/table.h +++ b/src/dataflow/core/execution/table.h @@ -4,7 +4,7 @@ #include #include -#include "src/dataflow/core/value.h" +#include "src/dataflow/core/execution/value.h" namespace dataflow { diff --git a/src/dataflow/core/value.cc b/src/dataflow/core/execution/value.cc similarity index 59% rename from src/dataflow/core/value.cc rename to src/dataflow/core/execution/value.cc index 7d612f8..1123fd6 100644 --- a/src/dataflow/core/value.cc +++ b/src/dataflow/core/execution/value.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/core/value.h" +#include "src/dataflow/core/execution/value.h" namespace dataflow { // All methods are inline in header for v0.1. 
diff --git a/src/dataflow/core/value.h b/src/dataflow/core/execution/value.h similarity index 100% rename from src/dataflow/core/value.h rename to src/dataflow/core/execution/value.h diff --git a/src/dataflow/planner/plan.cc b/src/dataflow/core/logical/planner/plan.cc similarity index 99% rename from src/dataflow/planner/plan.cc rename to src/dataflow/core/logical/planner/plan.cc index 7f92bda..5292760 100644 --- a/src/dataflow/planner/plan.cc +++ b/src/dataflow/core/logical/planner/plan.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/planner/plan.h" +#include "src/dataflow/core/logical/planner/plan.h" #include #include @@ -7,7 +7,7 @@ #include #include -#include "src/dataflow/serial/serializer.h" +#include "src/dataflow/core/execution/serial/serializer.h" namespace dataflow { diff --git a/src/dataflow/planner/plan.h b/src/dataflow/core/logical/planner/plan.h similarity index 98% rename from src/dataflow/planner/plan.h rename to src/dataflow/core/logical/planner/plan.h index d6be2a5..a568786 100644 --- a/src/dataflow/planner/plan.h +++ b/src/dataflow/core/logical/planner/plan.h @@ -4,7 +4,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/sql/sql_ast.h b/src/dataflow/core/logical/sql/sql_ast.h similarity index 95% rename from src/dataflow/sql/sql_ast.h rename to src/dataflow/core/logical/sql/sql_ast.h index 40a9306..883f5f8 100644 --- a/src/dataflow/sql/sql_ast.h +++ b/src/dataflow/core/logical/sql/sql_ast.h @@ -5,8 +5,8 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/core/value.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/execution/value.h" namespace dataflow { namespace sql { diff --git a/src/dataflow/sql/sql_errors.h b/src/dataflow/core/logical/sql/sql_errors.h similarity index 100% rename from src/dataflow/sql/sql_errors.h rename to src/dataflow/core/logical/sql/sql_errors.h diff --git 
a/src/dataflow/sql/sql_parser.cc b/src/dataflow/core/logical/sql/sql_parser.cc similarity index 99% rename from src/dataflow/sql/sql_parser.cc rename to src/dataflow/core/logical/sql/sql_parser.cc index 379ba7c..b458338 100644 --- a/src/dataflow/sql/sql_parser.cc +++ b/src/dataflow/core/logical/sql/sql_parser.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/sql/sql_parser.h" +#include "src/dataflow/core/logical/sql/sql_parser.h" #include #include diff --git a/src/dataflow/sql/sql_parser.h b/src/dataflow/core/logical/sql/sql_parser.h similarity index 67% rename from src/dataflow/sql/sql_parser.h rename to src/dataflow/core/logical/sql/sql_parser.h index 7f0613c..ca08ba8 100644 --- a/src/dataflow/sql/sql_parser.h +++ b/src/dataflow/core/logical/sql/sql_parser.h @@ -3,8 +3,8 @@ #include #include -#include "src/dataflow/sql/sql_ast.h" -#include "src/dataflow/sql/sql_errors.h" +#include "src/dataflow/core/logical/sql/sql_ast.h" +#include "src/dataflow/core/logical/sql/sql_errors.h" namespace dataflow { namespace sql { diff --git a/src/dataflow/sql/sql_planner.cc b/src/dataflow/core/logical/sql/sql_planner.cc similarity index 99% rename from src/dataflow/sql/sql_planner.cc rename to src/dataflow/core/logical/sql/sql_planner.cc index 255b47c..0b10802 100644 --- a/src/dataflow/sql/sql_planner.cc +++ b/src/dataflow/core/logical/sql/sql_planner.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/sql/sql_planner.h" +#include "src/dataflow/core/logical/sql/sql_planner.h" #include #include diff --git a/src/dataflow/sql/sql_planner.h b/src/dataflow/core/logical/sql/sql_planner.h similarity index 93% rename from src/dataflow/sql/sql_planner.h rename to src/dataflow/core/logical/sql/sql_planner.h index 1506a0a..d3ed70d 100644 --- a/src/dataflow/sql/sql_planner.h +++ b/src/dataflow/core/logical/sql/sql_planner.h @@ -4,10 +4,10 @@ #include #include -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/catalog/catalog.h" -#include "src/dataflow/stream/stream.h" -#include 
"src/dataflow/sql/sql_ast.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/catalog/catalog.h" +#include "src/dataflow/core/execution/stream/stream.h" +#include "src/dataflow/core/logical/sql/sql_ast.h" namespace dataflow { namespace sql { diff --git a/src/dataflow/examples/actor_rpc_client.cc b/src/dataflow/examples/actor_rpc_client.cc index 5d3c74e..bd43c03 100644 --- a/src/dataflow/examples/actor_rpc_client.cc +++ b/src/dataflow/examples/actor_rpc_client.cc @@ -1,6 +1,6 @@ #include -#include "src/dataflow/runner/actor_runtime.h" +#include "src/dataflow/experimental/runner/actor_runtime.h" int main(int argc, char* argv[]) { dataflow::ActorRuntimeConfig config; diff --git a/src/dataflow/examples/actor_rpc_scheduler.cc b/src/dataflow/examples/actor_rpc_scheduler.cc index 11f14a9..7018293 100644 --- a/src/dataflow/examples/actor_rpc_scheduler.cc +++ b/src/dataflow/examples/actor_rpc_scheduler.cc @@ -2,7 +2,7 @@ #include -#include "src/dataflow/runner/actor_runtime.h" +#include "src/dataflow/experimental/runner/actor_runtime.h" int main(int argc, char* argv[]) { dataflow::ActorRuntimeConfig config; diff --git a/src/dataflow/examples/actor_rpc_smoke.cc b/src/dataflow/examples/actor_rpc_smoke.cc index deb1e8f..de42d6e 100644 --- a/src/dataflow/examples/actor_rpc_smoke.cc +++ b/src/dataflow/examples/actor_rpc_smoke.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runner/actor_runtime.h" +#include "src/dataflow/experimental/runner/actor_runtime.h" int main() { return dataflow::runActorSmoke(); diff --git a/src/dataflow/examples/actor_rpc_worker.cc b/src/dataflow/examples/actor_rpc_worker.cc index b8eb193..727c783 100644 --- a/src/dataflow/examples/actor_rpc_worker.cc +++ b/src/dataflow/examples/actor_rpc_worker.cc @@ -1,6 +1,6 @@ #include -#include "src/dataflow/runner/actor_runtime.h" +#include "src/dataflow/experimental/runner/actor_runtime.h" int main(int argc, char* argv[]) { dataflow::ActorRuntimeConfig config; diff --git 
a/src/dataflow/examples/dataframe_demo.cc b/src/dataflow/examples/dataframe_demo.cc index a91c460..c9c7788 100644 --- a/src/dataflow/examples/dataframe_demo.cc +++ b/src/dataflow/examples/dataframe_demo.cc @@ -1,5 +1,5 @@ -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/api/session.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/api/session.h" #include diff --git a/src/dataflow/examples/sql_demo.cc b/src/dataflow/examples/sql_demo.cc index 3791f29..5f9b958 100644 --- a/src/dataflow/examples/sql_demo.cc +++ b/src/dataflow/examples/sql_demo.cc @@ -1,5 +1,5 @@ -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/api/session.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/api/session.h" #include diff --git a/src/dataflow/examples/stream_actor_benchmark.cc b/src/dataflow/examples/stream_actor_benchmark.cc index 3a10e78..4d5e33d 100644 --- a/src/dataflow/examples/stream_actor_benchmark.cc +++ b/src/dataflow/examples/stream_actor_benchmark.cc @@ -6,8 +6,8 @@ #include -#include "src/dataflow/runtime/observability.h" -#include "src/dataflow/stream/actor_stream_runtime.h" +#include "src/dataflow/core/execution/runtime/observability.h" +#include "src/dataflow/experimental/stream/actor_stream_runtime.h" namespace { diff --git a/src/dataflow/examples/stream_benchmark.cc b/src/dataflow/examples/stream_benchmark.cc index 9d2818a..2c61f13 100644 --- a/src/dataflow/examples/stream_benchmark.cc +++ b/src/dataflow/examples/stream_benchmark.cc @@ -6,9 +6,9 @@ #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/runtime/observability.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/runtime/observability.h" +#include "src/dataflow/core/execution/stream/stream.h" namespace { diff --git a/src/dataflow/examples/stream_demo.cc b/src/dataflow/examples/stream_demo.cc 
index 0d92214..49ab273 100644 --- a/src/dataflow/examples/stream_demo.cc +++ b/src/dataflow/examples/stream_demo.cc @@ -2,8 +2,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/stream/stream.h" int main() { dataflow::DataflowSession& session = dataflow::DataflowSession::builder(); diff --git a/src/dataflow/examples/stream_sql_demo.cc b/src/dataflow/examples/stream_sql_demo.cc index e2627be..1efe725 100644 --- a/src/dataflow/examples/stream_sql_demo.cc +++ b/src/dataflow/examples/stream_sql_demo.cc @@ -3,8 +3,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/stream/stream.h" int main() { namespace fs = std::filesystem; diff --git a/src/dataflow/examples/stream_state_container_demo.cc b/src/dataflow/examples/stream_state_container_demo.cc index e20feb0..5545006 100644 --- a/src/dataflow/examples/stream_state_container_demo.cc +++ b/src/dataflow/examples/stream_state_container_demo.cc @@ -1,7 +1,7 @@ #include #include -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/execution/stream/stream.h" int main() { auto state = dataflow::makeStateStore("memory"); diff --git a/src/dataflow/examples/stream_stateful_demo.cc b/src/dataflow/examples/stream_stateful_demo.cc index 7b103b7..73d8c51 100644 --- a/src/dataflow/examples/stream_stateful_demo.cc +++ b/src/dataflow/examples/stream_stateful_demo.cc @@ -2,8 +2,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/stream/stream.h" int main() { dataflow::DataflowSession& session = dataflow::DataflowSession::builder(); diff --git a/src/dataflow/examples/tpch_q1_style_benchmark.cc 
b/src/dataflow/examples/tpch_q1_style_benchmark.cc index 1d2e28e..cad26e5 100644 --- a/src/dataflow/examples/tpch_q1_style_benchmark.cc +++ b/src/dataflow/examples/tpch_q1_style_benchmark.cc @@ -5,7 +5,7 @@ #include #include -#include "src/dataflow/stream/actor_stream_runtime.h" +#include "src/dataflow/experimental/stream/actor_stream_runtime.h" namespace { diff --git a/src/dataflow/examples/vector_search_benchmark.cc b/src/dataflow/examples/vector_search_benchmark.cc index 376b0fc..9bf3f1f 100644 --- a/src/dataflow/examples/vector_search_benchmark.cc +++ b/src/dataflow/examples/vector_search_benchmark.cc @@ -8,10 +8,10 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/rpc/actor_rpc_codec.h" -#include "src/dataflow/serial/serializer.h" -#include "src/dataflow/stream/binary_row_batch.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/experimental/rpc/actor_rpc_codec.h" +#include "src/dataflow/core/execution/serial/serializer.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" namespace { diff --git a/src/dataflow/examples/velaria_cli.cc b/src/dataflow/examples/velaria_cli.cc index 916527a..6de9af9 100644 --- a/src/dataflow/examples/velaria_cli.cc +++ b/src/dataflow/examples/velaria_cli.cc @@ -3,8 +3,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/core/value.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/value.h" namespace { diff --git a/src/dataflow/examples/wordcount.cc b/src/dataflow/examples/wordcount.cc index 9a4e07e..ebe0061 100644 --- a/src/dataflow/examples/wordcount.cc +++ b/src/dataflow/examples/wordcount.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/api/dataframe.h" +#include "src/dataflow/core/contract/api/dataframe.h" #include #include diff --git a/src/dataflow/rpc/actor_rpc_codec.cc b/src/dataflow/experimental/rpc/actor_rpc_codec.cc similarity index 96% rename from 
src/dataflow/rpc/actor_rpc_codec.cc rename to src/dataflow/experimental/rpc/actor_rpc_codec.cc index 02de69f..abbfb88 100644 --- a/src/dataflow/rpc/actor_rpc_codec.cc +++ b/src/dataflow/experimental/rpc/actor_rpc_codec.cc @@ -1,8 +1,8 @@ -#include "src/dataflow/rpc/actor_rpc_codec.h" +#include "src/dataflow/experimental/rpc/actor_rpc_codec.h" #include -#include "src/dataflow/rpc/serialization.h" +#include "src/dataflow/experimental/rpc/serialization.h" namespace dataflow { diff --git a/src/dataflow/rpc/actor_rpc_codec.h b/src/dataflow/experimental/rpc/actor_rpc_codec.h similarity index 96% rename from src/dataflow/rpc/actor_rpc_codec.h rename to src/dataflow/experimental/rpc/actor_rpc_codec.h index 83c34c6..14f492b 100644 --- a/src/dataflow/rpc/actor_rpc_codec.h +++ b/src/dataflow/experimental/rpc/actor_rpc_codec.h @@ -5,7 +5,7 @@ #include #include -#include "src/dataflow/runtime/rpc_contract.h" +#include "src/dataflow/experimental/runtime/rpc_contract.h" namespace dataflow { diff --git a/src/dataflow/rpc/rpc_codec.cc b/src/dataflow/experimental/rpc/rpc_codec.cc similarity index 98% rename from src/dataflow/rpc/rpc_codec.cc rename to src/dataflow/experimental/rpc/rpc_codec.cc index 0f7c26a..bfe5ad7 100644 --- a/src/dataflow/rpc/rpc_codec.cc +++ b/src/dataflow/experimental/rpc/rpc_codec.cc @@ -1,11 +1,11 @@ -#include "src/dataflow/rpc/rpc_codec.h" +#include "src/dataflow/experimental/rpc/rpc_codec.h" #include #include #include #include -#include "src/dataflow/stream/binary_row_batch.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" namespace dataflow { diff --git a/src/dataflow/rpc/rpc_codec.h b/src/dataflow/experimental/rpc/rpc_codec.h similarity index 98% rename from src/dataflow/rpc/rpc_codec.h rename to src/dataflow/experimental/rpc/rpc_codec.h index 8e43ced..6910367 100644 --- a/src/dataflow/rpc/rpc_codec.h +++ b/src/dataflow/experimental/rpc/rpc_codec.h @@ -6,7 +6,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include 
"src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/rpc/serialization.cc b/src/dataflow/experimental/rpc/serialization.cc similarity index 98% rename from src/dataflow/rpc/serialization.cc rename to src/dataflow/experimental/rpc/serialization.cc index baca364..3f474ca 100644 --- a/src/dataflow/rpc/serialization.cc +++ b/src/dataflow/experimental/rpc/serialization.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/rpc/serialization.h" +#include "src/dataflow/experimental/rpc/serialization.h" #include #include @@ -7,7 +7,7 @@ #include #include -#include "src/dataflow/rpc/actor_rpc_codec.h" +#include "src/dataflow/experimental/rpc/actor_rpc_codec.h" namespace dataflow { diff --git a/src/dataflow/rpc/serialization.h b/src/dataflow/experimental/rpc/serialization.h similarity index 91% rename from src/dataflow/rpc/serialization.h rename to src/dataflow/experimental/rpc/serialization.h index 05439a7..029e54d 100644 --- a/src/dataflow/rpc/serialization.h +++ b/src/dataflow/experimental/rpc/serialization.h @@ -4,7 +4,7 @@ #include #include -#include "src/dataflow/runtime/rpc_contract.h" +#include "src/dataflow/experimental/runtime/rpc_contract.h" namespace dataflow { diff --git a/src/dataflow/runner/actor_runtime.cc b/src/dataflow/experimental/runner/actor_runtime.cc similarity index 98% rename from src/dataflow/runner/actor_runtime.cc rename to src/dataflow/experimental/runner/actor_runtime.cc index d3e6b7c..972e438 100644 --- a/src/dataflow/runner/actor_runtime.cc +++ b/src/dataflow/experimental/runner/actor_runtime.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runner/actor_runtime.h" +#include "src/dataflow/experimental/runner/actor_runtime.h" #include #include @@ -19,15 +19,15 @@ #include #include -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/api/session.h" -#include "src/dataflow/rpc/actor_rpc_codec.h" -#include "src/dataflow/rpc/rpc_codec.h" -#include "src/dataflow/runtime/job_master.h" -#include 
"src/dataflow/runtime/observability.h" -#include "src/dataflow/stream/binary_row_batch.h" -#include "src/dataflow/sql/sql_parser.h" -#include "src/dataflow/transport/ipc_transport.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/experimental/rpc/actor_rpc_codec.h" +#include "src/dataflow/experimental/rpc/rpc_codec.h" +#include "src/dataflow/experimental/runtime/job_master.h" +#include "src/dataflow/core/execution/runtime/observability.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" +#include "src/dataflow/core/logical/sql/sql_parser.h" +#include "src/dataflow/experimental/transport/ipc_transport.h" namespace dataflow { diff --git a/src/dataflow/runner/actor_runtime.h b/src/dataflow/experimental/runner/actor_runtime.h similarity index 100% rename from src/dataflow/runner/actor_runtime.h rename to src/dataflow/experimental/runner/actor_runtime.h diff --git a/src/dataflow/runtime/actor_runtime.cc b/src/dataflow/experimental/runtime/actor_runtime.cc similarity index 99% rename from src/dataflow/runtime/actor_runtime.cc rename to src/dataflow/experimental/runtime/actor_runtime.cc index 0bd5fdd..c7187df 100644 --- a/src/dataflow/runtime/actor_runtime.cc +++ b/src/dataflow/experimental/runtime/actor_runtime.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/actor_runtime.h" +#include "src/dataflow/experimental/runtime/actor_runtime.h" #include diff --git a/src/dataflow/runtime/actor_runtime.h b/src/dataflow/experimental/runtime/actor_runtime.h similarity index 97% rename from src/dataflow/runtime/actor_runtime.h rename to src/dataflow/experimental/runtime/actor_runtime.h index 2b8732b..6137653 100644 --- a/src/dataflow/runtime/actor_runtime.h +++ b/src/dataflow/experimental/runtime/actor_runtime.h @@ -15,9 +15,9 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/runtime/rpc_runner.h" -#include "src/dataflow/runtime/rpc_contract.h" 
+#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/experimental/runtime/rpc_runner.h" +#include "src/dataflow/experimental/runtime/rpc_contract.h" namespace dataflow { diff --git a/src/dataflow/runtime/byte_transport.cc b/src/dataflow/experimental/runtime/byte_transport.cc similarity index 97% rename from src/dataflow/runtime/byte_transport.cc rename to src/dataflow/experimental/runtime/byte_transport.cc index c09d002..4aaa4c0 100644 --- a/src/dataflow/runtime/byte_transport.cc +++ b/src/dataflow/experimental/runtime/byte_transport.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/byte_transport.h" +#include "src/dataflow/experimental/runtime/byte_transport.h" #include diff --git a/src/dataflow/runtime/byte_transport.h b/src/dataflow/experimental/runtime/byte_transport.h similarity index 100% rename from src/dataflow/runtime/byte_transport.h rename to src/dataflow/experimental/runtime/byte_transport.h diff --git a/src/dataflow/runtime/job_master.cc b/src/dataflow/experimental/runtime/job_master.cc similarity index 99% rename from src/dataflow/runtime/job_master.cc rename to src/dataflow/experimental/runtime/job_master.cc index fa6625f..9d2476d 100644 --- a/src/dataflow/runtime/job_master.cc +++ b/src/dataflow/experimental/runtime/job_master.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/job_master.h" +#include "src/dataflow/experimental/runtime/job_master.h" #include #include @@ -8,7 +8,7 @@ #include #include -#include "src/dataflow/runtime/observability.h" +#include "src/dataflow/core/execution/runtime/observability.h" namespace dataflow { diff --git a/src/dataflow/runtime/job_master.h b/src/dataflow/experimental/runtime/job_master.h similarity index 97% rename from src/dataflow/runtime/job_master.h rename to src/dataflow/experimental/runtime/job_master.h index 328b86c..0a534f6 100644 --- a/src/dataflow/runtime/job_master.h +++ b/src/dataflow/experimental/runtime/job_master.h @@ -10,9 +10,9 @@ #include #include -#include 
"src/dataflow/core/table.h" -#include "src/dataflow/planner/plan.h" -#include "src/dataflow/runtime/executor.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/logical/planner/plan.h" +#include "src/dataflow/core/execution/runtime/executor.h" namespace dataflow { diff --git a/src/dataflow/runtime/rpc_contract.h b/src/dataflow/experimental/runtime/rpc_contract.h similarity index 98% rename from src/dataflow/runtime/rpc_contract.h rename to src/dataflow/experimental/runtime/rpc_contract.h index 618b34c..0446121 100644 --- a/src/dataflow/runtime/rpc_contract.h +++ b/src/dataflow/experimental/runtime/rpc_contract.h @@ -6,8 +6,8 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/sql/sql_errors.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/logical/sql/sql_errors.h" namespace dataflow { diff --git a/src/dataflow/runtime/rpc_runner.cc b/src/dataflow/experimental/runtime/rpc_runner.cc similarity index 99% rename from src/dataflow/runtime/rpc_runner.cc rename to src/dataflow/experimental/runtime/rpc_runner.cc index f291225..6e6ceae 100644 --- a/src/dataflow/runtime/rpc_runner.cc +++ b/src/dataflow/experimental/runtime/rpc_runner.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/runtime/rpc_runner.h" +#include "src/dataflow/experimental/runtime/rpc_runner.h" #include diff --git a/src/dataflow/runtime/rpc_runner.h b/src/dataflow/experimental/runtime/rpc_runner.h similarity index 93% rename from src/dataflow/runtime/rpc_runner.h rename to src/dataflow/experimental/runtime/rpc_runner.h index 1a008c7..1f8ae51 100644 --- a/src/dataflow/runtime/rpc_runner.h +++ b/src/dataflow/experimental/runtime/rpc_runner.h @@ -6,10 +6,10 @@ #include #include -#include "src/dataflow/core/table.h" -#include "src/dataflow/rpc/rpc_codec.h" -#include "src/dataflow/runtime/rpc_contract.h" -#include "src/dataflow/runtime/byte_transport.h" +#include "src/dataflow/core/execution/table.h" +#include 
"src/dataflow/experimental/rpc/rpc_codec.h" +#include "src/dataflow/experimental/runtime/rpc_contract.h" +#include "src/dataflow/experimental/runtime/byte_transport.h" namespace dataflow { diff --git a/src/dataflow/stream/actor_stream_runtime.cc b/src/dataflow/experimental/stream/actor_stream_runtime.cc similarity index 99% rename from src/dataflow/stream/actor_stream_runtime.cc rename to src/dataflow/experimental/stream/actor_stream_runtime.cc index cc361aa..a276d14 100644 --- a/src/dataflow/stream/actor_stream_runtime.cc +++ b/src/dataflow/experimental/stream/actor_stream_runtime.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/stream/actor_stream_runtime.h" +#include "src/dataflow/experimental/stream/actor_stream_runtime.h" #include #include @@ -17,9 +17,9 @@ #include #include -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/stream/binary_row_batch.h" -#include "src/dataflow/transport/ipc_transport.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" +#include "src/dataflow/experimental/transport/ipc_transport.h" namespace dataflow { diff --git a/src/dataflow/stream/actor_stream_runtime.h b/src/dataflow/experimental/stream/actor_stream_runtime.h similarity index 98% rename from src/dataflow/stream/actor_stream_runtime.h rename to src/dataflow/experimental/stream/actor_stream_runtime.h index b6cab41..14bb1de 100644 --- a/src/dataflow/stream/actor_stream_runtime.h +++ b/src/dataflow/experimental/stream/actor_stream_runtime.h @@ -5,7 +5,7 @@ #include #include -#include "src/dataflow/core/table.h" +#include "src/dataflow/core/execution/table.h" namespace dataflow { diff --git a/src/dataflow/transport/ipc_transport.cc b/src/dataflow/experimental/transport/ipc_transport.cc similarity index 97% rename from src/dataflow/transport/ipc_transport.cc rename to src/dataflow/experimental/transport/ipc_transport.cc index a580aef..640445c 100644 --- a/src/dataflow/transport/ipc_transport.cc +++ 
b/src/dataflow/experimental/transport/ipc_transport.cc @@ -1,4 +1,4 @@ -#include "src/dataflow/transport/ipc_transport.h" +#include "src/dataflow/experimental/transport/ipc_transport.h" #include #include @@ -11,7 +11,7 @@ #include #include -#include "src/dataflow/rpc/rpc_codec.h" +#include "src/dataflow/experimental/rpc/rpc_codec.h" namespace dataflow { diff --git a/src/dataflow/transport/ipc_transport.h b/src/dataflow/experimental/transport/ipc_transport.h similarity index 95% rename from src/dataflow/transport/ipc_transport.h rename to src/dataflow/experimental/transport/ipc_transport.h index 2cbd8e4..b53d9b4 100644 --- a/src/dataflow/transport/ipc_transport.h +++ b/src/dataflow/experimental/transport/ipc_transport.h @@ -5,7 +5,7 @@ #include #include -#include "src/dataflow/rpc/rpc_codec.h" +#include "src/dataflow/experimental/rpc/rpc_codec.h" namespace dataflow { diff --git a/src/dataflow/python/python_module.cc b/src/dataflow/interop/python/python_module.cc similarity index 99% rename from src/dataflow/python/python_module.cc rename to src/dataflow/interop/python/python_module.cc index c74b0cd..c3993a3 100644 --- a/src/dataflow/python/python_module.cc +++ b/src/dataflow/interop/python/python_module.cc @@ -10,11 +10,11 @@ #include #include -#include "src/dataflow/api/dataframe.h" -#include "src/dataflow/api/session.h" -#include "src/dataflow/core/table.h" -#include "src/dataflow/core/value.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/dataframe.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/table.h" +#include "src/dataflow/core/execution/value.h" +#include "src/dataflow/core/execution/stream/stream.h" namespace { diff --git a/src/dataflow/tests/planner_v03_test.cc b/src/dataflow/tests/planner_v03_test.cc index 8322f9b..129a497 100644 --- a/src/dataflow/tests/planner_v03_test.cc +++ b/src/dataflow/tests/planner_v03_test.cc @@ -2,8 +2,8 @@ #include #include -#include 
"src/dataflow/planner/plan.h" -#include "src/dataflow/runtime/executor.h" +#include "src/dataflow/core/logical/planner/plan.h" +#include "src/dataflow/core/execution/runtime/executor.h" namespace { diff --git a/src/dataflow/tests/source_sink_abi_test.cc b/src/dataflow/tests/source_sink_abi_test.cc index 988e554..8adcdd4 100644 --- a/src/dataflow/tests/source_sink_abi_test.cc +++ b/src/dataflow/tests/source_sink_abi_test.cc @@ -4,9 +4,9 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/source_sink_abi.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/contract/source_sink_abi.h" +#include "src/dataflow/core/execution/stream/stream.h" namespace { diff --git a/src/dataflow/tests/sql_regression_test.cc b/src/dataflow/tests/sql_regression_test.cc index e37e6c9..cc513ed 100644 --- a/src/dataflow/tests/sql_regression_test.cc +++ b/src/dataflow/tests/sql_regression_test.cc @@ -8,10 +8,10 @@ #include #include -#include "src/dataflow/catalog/catalog.h" -#include "src/dataflow/api/session.h" -#include "src/dataflow/sql/sql_planner.h" -#include "src/dataflow/sql/sql_parser.h" +#include "src/dataflow/core/contract/catalog/catalog.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/logical/sql/sql_planner.h" +#include "src/dataflow/core/logical/sql/sql_parser.h" namespace { diff --git a/src/dataflow/tests/stream_actor_credit_test.cc b/src/dataflow/tests/stream_actor_credit_test.cc index c855ab2..ff7cd08 100644 --- a/src/dataflow/tests/stream_actor_credit_test.cc +++ b/src/dataflow/tests/stream_actor_credit_test.cc @@ -5,7 +5,7 @@ #include #include -#include "src/dataflow/stream/actor_stream_runtime.h" +#include "src/dataflow/experimental/stream/actor_stream_runtime.h" namespace { diff --git a/src/dataflow/tests/stream_runtime_test.cc b/src/dataflow/tests/stream_runtime_test.cc index 332112b..db45cd7 100644 --- 
a/src/dataflow/tests/stream_runtime_test.cc +++ b/src/dataflow/tests/stream_runtime_test.cc @@ -9,8 +9,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/stream/stream.h" namespace { diff --git a/src/dataflow/tests/stream_strategy_explain_test.cc b/src/dataflow/tests/stream_strategy_explain_test.cc index f1efdf1..6bdf558 100644 --- a/src/dataflow/tests/stream_strategy_explain_test.cc +++ b/src/dataflow/tests/stream_strategy_explain_test.cc @@ -5,8 +5,8 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/stream/stream.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/core/execution/stream/stream.h" namespace { diff --git a/src/dataflow/tests/vector_runtime_test.cc b/src/dataflow/tests/vector_runtime_test.cc index 6773f7c..da11bde 100644 --- a/src/dataflow/tests/vector_runtime_test.cc +++ b/src/dataflow/tests/vector_runtime_test.cc @@ -6,11 +6,11 @@ #include #include -#include "src/dataflow/api/session.h" -#include "src/dataflow/rpc/actor_rpc_codec.h" -#include "src/dataflow/rpc/rpc_codec.h" -#include "src/dataflow/serial/serializer.h" -#include "src/dataflow/stream/binary_row_batch.h" +#include "src/dataflow/core/contract/api/session.h" +#include "src/dataflow/experimental/rpc/actor_rpc_codec.h" +#include "src/dataflow/experimental/rpc/rpc_codec.h" +#include "src/dataflow/core/execution/serial/serializer.h" +#include "src/dataflow/core/execution/stream/binary_row_batch.h" namespace { From 8100315a7ead454a367a5482306563143acc804c Mon Sep 17 00:00:00 2001 From: zuolingxuan Date: Mon, 30 Mar 2026 17:15:54 +0800 Subject: [PATCH 3/3] Align workflows with layered regressions --- .github/workflows/ci.yml | 30 +++++++----------------------- .github/workflows/release.yml | 1 - 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index 6098a3e..b768e5a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,6 @@ jobs: - name: Configure Python for Bazel run: | echo "VELARIA_PYTHON_BIN=$(which python)" >> "$GITHUB_ENV" - echo "VELARIA_PLATFORM_TAG=macosx_11_0_arm64" >> "$GITHUB_ENV" - name: Native Build run: | @@ -37,39 +36,24 @@ jobs: - name: Native Regression run: | - bazel test \ - //:planner_v03_test \ - //:sql_regression_test \ - //:stream_runtime_test \ - //:stream_actor_credit_test \ - //:source_sink_abi_test \ - //:stream_strategy_explain_test + ./scripts/run_core_regression.sh - name: Same-host Observability Regression run: | ./scripts/run_stream_observability_regression.sh - - name: Python Regression + - name: Experimental Regression + run: | + bazel test //:experimental_regression + + - name: Python Ecosystem Regression env: FEISHU_BITABLE_APP_ID: ${{ secrets.FEISHU_BITABLE_APP_ID }} FEISHU_BITABLE_APP_SECRET: ${{ secrets.FEISHU_BITABLE_APP_SECRET }} FEISHU_BITABLE_BASE_URL: ${{ vars.FEISHU_BITABLE_BASE_URL }} FEISHU_BITABLE_OWNER_FIELD: ${{ vars.FEISHU_BITABLE_OWNER_FIELD }} run: | - bazel test \ - //python_api:custom_stream_source_test \ - //python_api:streaming_v05_test \ - //python_api:arrow_stream_ingestion_test \ - //python_api:bitable_stream_source_test \ - //python_api:bitable_group_by_owner_integration_test \ - --test_env=FEISHU_BITABLE_APP_ID \ - --test_env=FEISHU_BITABLE_APP_SECRET \ - --test_env=FEISHU_BITABLE_BASE_URL \ - --test_env=FEISHU_BITABLE_OWNER_FIELD - - - name: Python Smoke - run: | - ./scripts/run_python_ci_checks.sh + ./scripts/run_python_ecosystem_regression.sh wheel-manylinux: if: startsWith(github.ref, 'refs/tags/') || startsWith(github.ref, 'refs/heads/release/') diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8bf22e4..553197f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -62,7 +62,6 @@ jobs: - name: Configure Python for Bazel run: | 
echo "VELARIA_PYTHON_BIN=$(which python)" >> "$GITHUB_ENV" - echo "VELARIA_PLATFORM_TAG=macosx_11_0_arm64" >> "$GITHUB_ENV" - name: Build native wheel run: |