Skip to content

Commit 23f7ba0

Browse files
committed
fix: remove non-columnar formats from R2 storage claim; add SQL parser features
- Remove CSV, JSON, and Arrow from the "Where the data lives" section — these formats don't support range requests; only the columnar formats (Parquet, Lance, Iceberg) do.
- Add EXISTS/NOT EXISTS, NATURAL JOIN, USING, parameter binding, SHOW VERSIONS, and DIFF to the SQL parser, with full test coverage.
1 parent 776c659 commit 23f7ba0

File tree

7 files changed

+346
-16
lines changed

7 files changed

+346
-16
lines changed

docs/src/content/docs/why-querymode.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Three analyses on one result set. No SQL string construction, no JSON parsing, n
5858

5959
### Where the data lives
6060

61-
Data sits in **R2** as columnar files (Parquet, Lance, Iceberg, CSV, JSON, Arrow). Nothing gets replicated to 300 edge nodes. Regional Query DOs cache table footers (~4KB each) and read data pages from R2 via coalesced HTTP range requests (~10ms).
61+
Data sits in **R2** as columnar files (Parquet, Lance, Iceberg). Nothing gets replicated to 300 edge nodes. Regional Query DOs cache table footers (~4KB each) and read data pages from R2 via coalesced HTTP range requests (~10ms).
6262

6363
"Data at the edge" means metadata cached locally, pages fetched on demand with free egress. Not replicated databases.
6464

src/sql/ast.ts

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ export type SqlExpr =
2626
| { kind: "case_expr"; operand?: SqlExpr; whenClauses: CaseWhen[]; elseResult?: SqlExpr }
2727
| { kind: "cast"; expr: SqlExpr; targetType: string }
2828
| { kind: "star" }
29-
| { kind: "near"; column: SqlExpr; vector: number[]; topK?: number };
29+
| { kind: "near"; column: SqlExpr; vector: number[]; topK?: number }
30+
| { kind: "exists"; subquery: SelectStmt; negated: boolean }
31+
| { kind: "parameter"; index: number };
3032

3133
export interface CaseWhen {
3234
condition: SqlExpr;
@@ -57,6 +59,8 @@ export interface JoinClause {
5759
joinType: JoinType;
5860
table: TableRef;
5961
onCondition?: SqlExpr;
62+
natural?: boolean;
63+
using?: string[];
6064
}
6165

6266
export type TableRef =
@@ -91,3 +95,20 @@ export interface SelectStmt {
9195
offset?: number;
9296
setOperation?: SetOperation;
9397
}
98+
99+
export interface ShowVersionsStmt {
100+
table: string;
101+
limit?: number;
102+
}
103+
104+
export interface DiffStmt {
105+
table: string;
106+
fromVersion: number;
107+
toVersion?: number;
108+
limit?: number;
109+
}
110+
111+
export type SqlStatement =
112+
| { kind: "select"; stmt: SelectStmt }
113+
| { kind: "show_versions"; stmt: ShowVersionsStmt }
114+
| { kind: "diff"; stmt: DiffStmt };

src/sql/lexer.test.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,4 +118,28 @@ describe("SQL Lexer", () => {
118118
expect(() => tokenize("SELECT `unclosed")).toThrow(SqlLexerError);
119119
expect(() => tokenize("SELECT `unclosed")).toThrow(/Unterminated backtick identifier/);
120120
});
121+
122+
it("tokenizes DDL keywords", () => {
123+
const tokens = tokenize("CREATE TABLE IF NOT EXISTS DROP ALTER INDEX VECTOR SHOW INDEXES");
124+
expect(tokens[0].type).toBe(TokenType.CREATE);
125+
expect(tokens[1].type).toBe(TokenType.TABLE);
126+
expect(tokens[2].type).toBe(TokenType.IF);
127+
expect(tokens[3].type).toBe(TokenType.NOT);
128+
expect(tokens[4].type).toBe(TokenType.EXISTS);
129+
expect(tokens[5].type).toBe(TokenType.DROP);
130+
expect(tokens[6].type).toBe(TokenType.ALTER);
131+
expect(tokens[7].type).toBe(TokenType.INDEX);
132+
expect(tokens[8].type).toBe(TokenType.VECTOR);
133+
expect(tokens[9].type).toBe(TokenType.SHOW);
134+
expect(tokens[10].type).toBe(TokenType.INDEXES);
135+
expect(tokens[11].type).toBe(TokenType.EOF);
136+
});
137+
138+
it("tokenizes ? as PARAMETER token", () => {
139+
const tokens = tokenize("SELECT * FROM t WHERE id = ? AND name = ?");
140+
// Tokens: SELECT * FROM t WHERE id = ? AND name = ? EOF
141+
// 0 1 2 3 4 5 6 7 8 9 10 11 12
142+
expect(tokens[7]).toMatchObject({ type: TokenType.PARAMETER, lexeme: "?" });
143+
expect(tokens[11]).toMatchObject({ type: TokenType.PARAMETER, lexeme: "?" });
144+
});
121145
});

src/sql/lexer.ts

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,24 @@ export enum TokenType {
66
IS, NULL, AS, DISTINCT, ORDER, BY, ASC, DESC, LIMIT, OFFSET,
77
GROUP, HAVING,
88

9+
// DDL keywords
10+
CREATE, DROP, ALTER, TABLE, INDEX, VECTOR, IF, SHOW, INDEXES,
11+
12+
// Time travel / diff keywords
13+
DIFF, VERSION, VERSIONS, CHANGES, SINCE, FOR, HEAD,
14+
915
// JOIN keywords
10-
JOIN, LEFT, RIGHT, INNER, OUTER, FULL, CROSS, ON,
16+
JOIN, LEFT, RIGHT, INNER, OUTER, FULL, CROSS, ON, NATURAL, USING,
1117

1218
// Set operations
1319
UNION, INTERSECT, EXCEPT, ALL,
1420

1521
// CASE
1622
CASE, WHEN, THEN, ELSE, END,
1723

24+
// Subquery keywords
25+
EXISTS,
26+
1827
// Type casting
1928
CAST,
2029

@@ -24,11 +33,20 @@ export enum TokenType {
2433
// Vector search
2534
NEAR, TOPK,
2635

36+
// Logic table extension
37+
WITH, DATA, LOGIC_TABLE, LOGIC,
38+
2739
// Aggregate functions (recognized as keywords for cleaner parsing)
2840
COUNT, SUM, AVG, MIN, MAX,
2941

30-
// Window function names
31-
ROW_NUMBER, RANK, DENSE_RANK, LAG, LEAD,
42+
// Ranking functions
43+
ROW_NUMBER, RANK, DENSE_RANK, NTILE, PERCENT_RANK, CUME_DIST,
44+
45+
// Offset/Analytic functions
46+
LAG, LEAD, FIRST_VALUE, LAST_VALUE, NTH_VALUE,
47+
48+
// Time window functions
49+
INTERVAL, SESSION, TUMBLE, HOP,
3250

3351
// Boolean
3452
TRUE, FALSE,
@@ -40,6 +58,7 @@ export enum TokenType {
4058
STAR, EQ, NE, LT, LE, GT, GE,
4159
PLUS, MINUS, SLASH, CONCAT,
4260
COMMA, DOT, LPAREN, RPAREN, SEMICOLON, LBRACKET, RBRACKET,
61+
PARAMETER,
4362

4463
EOF,
4564
}
@@ -58,23 +77,39 @@ const KEYWORDS = new Map<string, TokenType>([
5877
["DISTINCT", TokenType.DISTINCT], ["ORDER", TokenType.ORDER], ["BY", TokenType.BY],
5978
["ASC", TokenType.ASC], ["DESC", TokenType.DESC], ["LIMIT", TokenType.LIMIT],
6079
["OFFSET", TokenType.OFFSET], ["GROUP", TokenType.GROUP], ["HAVING", TokenType.HAVING],
80+
["CREATE", TokenType.CREATE], ["DROP", TokenType.DROP], ["ALTER", TokenType.ALTER],
81+
["TABLE", TokenType.TABLE], ["INDEX", TokenType.INDEX], ["VECTOR", TokenType.VECTOR],
82+
["IF", TokenType.IF], ["SHOW", TokenType.SHOW], ["INDEXES", TokenType.INDEXES],
83+
["DIFF", TokenType.DIFF], ["VERSION", TokenType.VERSION], ["VERSIONS", TokenType.VERSIONS],
84+
["CHANGES", TokenType.CHANGES], ["SINCE", TokenType.SINCE], ["FOR", TokenType.FOR],
85+
["HEAD", TokenType.HEAD],
6186
["JOIN", TokenType.JOIN], ["LEFT", TokenType.LEFT], ["RIGHT", TokenType.RIGHT],
6287
["INNER", TokenType.INNER], ["OUTER", TokenType.OUTER], ["FULL", TokenType.FULL],
63-
["CROSS", TokenType.CROSS], ["ON", TokenType.ON],
88+
["CROSS", TokenType.CROSS], ["ON", TokenType.ON], ["NATURAL", TokenType.NATURAL],
89+
["USING", TokenType.USING],
6490
["UNION", TokenType.UNION], ["INTERSECT", TokenType.INTERSECT],
6591
["EXCEPT", TokenType.EXCEPT], ["ALL", TokenType.ALL],
6692
["CASE", TokenType.CASE], ["WHEN", TokenType.WHEN], ["THEN", TokenType.THEN],
6793
["ELSE", TokenType.ELSE], ["END", TokenType.END],
94+
["EXISTS", TokenType.EXISTS],
6895
["CAST", TokenType.CAST],
6996
["OVER", TokenType.OVER], ["PARTITION", TokenType.PARTITION],
7097
["ROWS", TokenType.ROWS], ["RANGE", TokenType.RANGE],
7198
["UNBOUNDED", TokenType.UNBOUNDED], ["PRECEDING", TokenType.PRECEDING],
7299
["FOLLOWING", TokenType.FOLLOWING], ["CURRENT", TokenType.CURRENT],
73100
["NEAR", TokenType.NEAR], ["TOPK", TokenType.TOPK],
101+
["WITH", TokenType.WITH], ["DATA", TokenType.DATA],
102+
["LOGIC_TABLE", TokenType.LOGIC_TABLE], ["LOGIC", TokenType.LOGIC],
74103
["COUNT", TokenType.COUNT], ["SUM", TokenType.SUM], ["AVG", TokenType.AVG],
75104
["MIN", TokenType.MIN], ["MAX", TokenType.MAX],
76105
["ROW_NUMBER", TokenType.ROW_NUMBER], ["RANK", TokenType.RANK],
77-
["DENSE_RANK", TokenType.DENSE_RANK], ["LAG", TokenType.LAG], ["LEAD", TokenType.LEAD],
106+
["DENSE_RANK", TokenType.DENSE_RANK], ["NTILE", TokenType.NTILE],
107+
["PERCENT_RANK", TokenType.PERCENT_RANK], ["CUME_DIST", TokenType.CUME_DIST],
108+
["LAG", TokenType.LAG], ["LEAD", TokenType.LEAD],
109+
["FIRST_VALUE", TokenType.FIRST_VALUE], ["LAST_VALUE", TokenType.LAST_VALUE],
110+
["NTH_VALUE", TokenType.NTH_VALUE],
111+
["INTERVAL", TokenType.INTERVAL], ["SESSION", TokenType.SESSION],
112+
["TUMBLE", TokenType.TUMBLE], ["HOP", TokenType.HOP],
78113
["TRUE", TokenType.TRUE], ["FALSE", TokenType.FALSE],
79114
]);
80115

@@ -158,6 +193,7 @@ export function tokenize(sql: string): Token[] {
158193
case "/": tokens.push({ type: TokenType.SLASH, lexeme: "/", position: start }); break;
159194
case "[": tokens.push({ type: TokenType.LBRACKET, lexeme: "[", position: start }); break;
160195
case "]": tokens.push({ type: TokenType.RBRACKET, lexeme: "]", position: start }); break;
196+
case "?": tokens.push({ type: TokenType.PARAMETER, lexeme: "?", position: start }); break;
161197
case "=": tokens.push({ type: TokenType.EQ, lexeme: "=", position: start }); break;
162198
case "!":
163199
if (pos < len && sql[pos] === "=") {

src/sql/parser.test.ts

Lines changed: 135 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, it, expect } from "vitest";
2-
import { parse } from "./parser.js";
2+
import { parse, parseStatement } from "./parser.js";
33
import { SqlParseError } from "./parser.js";
44

55
describe("SQL Parser", () => {
@@ -251,4 +251,138 @@ describe("SQL Parser", () => {
251251
}
252252
}
253253
});
254+
255+
it("parses EXISTS subquery", () => {
256+
const stmt = parse("SELECT * FROM t WHERE EXISTS (SELECT id FROM u WHERE u.tid = t.id)");
257+
expect(stmt.where!.kind).toBe("exists");
258+
if (stmt.where!.kind === "exists") {
259+
expect(stmt.where!.negated).toBe(false);
260+
expect(stmt.where!.subquery.from).toEqual({ kind: "simple", name: "u", alias: undefined });
261+
}
262+
});
263+
264+
it("parses NOT EXISTS subquery", () => {
265+
const stmt = parse("SELECT * FROM t WHERE NOT EXISTS (SELECT id FROM u WHERE u.tid = t.id)");
266+
expect(stmt.where!.kind).toBe("exists");
267+
if (stmt.where!.kind === "exists") {
268+
expect(stmt.where!.negated).toBe(true);
269+
expect(stmt.where!.subquery.from).toEqual({ kind: "simple", name: "u", alias: undefined });
270+
}
271+
});
272+
273+
it("parses NATURAL JOIN", () => {
274+
const stmt = parse("SELECT * FROM a NATURAL JOIN b");
275+
expect(stmt.from.kind).toBe("join");
276+
if (stmt.from.kind === "join") {
277+
expect(stmt.from.join.natural).toBe(true);
278+
expect(stmt.from.join.joinType).toBe("inner");
279+
expect(stmt.from.join.table).toEqual({ kind: "simple", name: "b", alias: undefined });
280+
}
281+
});
282+
283+
it("parses NATURAL LEFT JOIN", () => {
284+
const stmt = parse("SELECT * FROM a NATURAL LEFT JOIN b");
285+
expect(stmt.from.kind).toBe("join");
286+
if (stmt.from.kind === "join") {
287+
expect(stmt.from.join.natural).toBe(true);
288+
expect(stmt.from.join.joinType).toBe("left");
289+
}
290+
});
291+
292+
it("parses JOIN with USING clause", () => {
293+
const stmt = parse("SELECT * FROM a JOIN b USING (id)");
294+
expect(stmt.from.kind).toBe("join");
295+
if (stmt.from.kind === "join") {
296+
expect(stmt.from.join.using).toEqual(["id"]);
297+
expect(stmt.from.join.onCondition).toBeUndefined();
298+
}
299+
});
300+
301+
it("parses JOIN with USING clause with multiple columns", () => {
302+
const stmt = parse("SELECT * FROM a JOIN b USING (id, name)");
303+
expect(stmt.from.kind).toBe("join");
304+
if (stmt.from.kind === "join") {
305+
expect(stmt.from.join.using).toEqual(["id", "name"]);
306+
}
307+
});
308+
309+
it("parses ? parameter binding", () => {
310+
const stmt = parse("SELECT * FROM t WHERE id = ?");
311+
expect(stmt.where!.kind).toBe("binary");
312+
if (stmt.where!.kind === "binary") {
313+
expect(stmt.where!.right.kind).toBe("parameter");
314+
if (stmt.where!.right.kind === "parameter") {
315+
expect(stmt.where!.right.index).toBe(0);
316+
}
317+
}
318+
});
319+
320+
it("parses multiple ? parameters with incrementing indices", () => {
321+
const stmt = parse("SELECT * FROM t WHERE a = ? AND b = ?");
322+
if (stmt.where!.kind === "binary" && stmt.where!.op === "and") {
323+
const left = stmt.where!.left;
324+
const right = stmt.where!.right;
325+
if (left.kind === "binary" && left.right.kind === "parameter") {
326+
expect(left.right.index).toBe(0);
327+
}
328+
if (right.kind === "binary" && right.right.kind === "parameter") {
329+
expect(right.right.index).toBe(1);
330+
}
331+
}
332+
});
333+
});
334+
335+
describe("SQL Statement Parser", () => {
336+
it("parses SHOW VERSIONS FOR table", () => {
337+
const result = parseStatement("SHOW VERSIONS FOR users");
338+
expect(result.kind).toBe("show_versions");
339+
if (result.kind === "show_versions") {
340+
expect(result.stmt.table).toBe("users");
341+
expect(result.stmt.limit).toBeUndefined();
342+
}
343+
});
344+
345+
it("parses SHOW VERSIONS FOR table LIMIT", () => {
346+
const result = parseStatement("SHOW VERSIONS FOR users LIMIT 10");
347+
if (result.kind === "show_versions") {
348+
expect(result.stmt.table).toBe("users");
349+
expect(result.stmt.limit).toBe(10);
350+
}
351+
});
352+
353+
it("parses DIFF table VERSION n", () => {
354+
const result = parseStatement("DIFF users VERSION 3");
355+
expect(result.kind).toBe("diff");
356+
if (result.kind === "diff") {
357+
expect(result.stmt.table).toBe("users");
358+
expect(result.stmt.fromVersion).toBe(3);
359+
expect(result.stmt.toVersion).toBeUndefined();
360+
}
361+
});
362+
363+
it("parses DIFF table VERSION n AND VERSION m", () => {
364+
const result = parseStatement("DIFF users VERSION 2 AND VERSION 5");
365+
if (result.kind === "diff") {
366+
expect(result.stmt.table).toBe("users");
367+
expect(result.stmt.fromVersion).toBe(2);
368+
expect(result.stmt.toVersion).toBe(5);
369+
}
370+
});
371+
372+
it("parses DIFF with LIMIT", () => {
373+
const result = parseStatement("DIFF users VERSION 1 AND VERSION 3 LIMIT 50");
374+
if (result.kind === "diff") {
375+
expect(result.stmt.fromVersion).toBe(1);
376+
expect(result.stmt.toVersion).toBe(3);
377+
expect(result.stmt.limit).toBe(50);
378+
}
379+
});
380+
381+
it("parses SELECT via parseStatement", () => {
382+
const result = parseStatement("SELECT * FROM t");
383+
expect(result.kind).toBe("select");
384+
if (result.kind === "select") {
385+
expect(result.stmt.from).toEqual({ kind: "simple", name: "t", alias: undefined });
386+
}
387+
});
254388
});

0 commit comments

Comments
 (0)