Skip to content

Commit f951a6d

Browse files
authored
Add parquet flatbuf schema
1 parent 5d56a53 commit f951a6d

File tree

1 file changed

+224
-0
lines changed

1 file changed

+224
-0
lines changed

src/main/flatbuf/parquet3.fbs

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
namespace parquet.format3;
2+
3+
// Optimization notes
4+
// 1. Statistics are stored in integral types if their size is fixed, otherwise as prefix + suffix.
5+
// 2. ColumnMetaData.encoding_stats are removed; they are replaced with
6+
// ColumnMetadata.is_fully_dict_encoded.
7+
// 3. RowGroups are limited to 2GB in size, so we can use int for sizes.
8+
// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can
9+
// use int for offsets.
10+
// 5. Remove ordinal.
11+
// 6. Restrict RowGroups to 2^31-1 rows.
12+
// 7. Remove offset/column indexes: they are small, and their offsets alone are of similar size.
13+
14+
///////////////////////////////////////////////////////////////////////////////////////////////////
15+
// Physical types.
16+
///////////////////////////////////////////////////////////////////////////////////////////////////
17+
18+
// Physical type of a column, stored as a single byte.
// Referenced by SchemaElement.type; the comment inside Statistics describes how min/max values
// are laid out for each of these types.
// NOTE(review): the numbering presumably mirrors the thrift `Type` enum, as the Encoding and
// CompressionCodec enums say they do -- confirm before relying on casts.
enum Type : byte {
19+
BOOLEAN = 0,
20+
INT32 = 1,
21+
INT64 = 2,
22+
INT96 = 3,
23+
FLOAT = 4,
24+
DOUBLE = 5,
25+
BYTE_ARRAY = 6,
26+
FIXED_LEN_BYTE_ARRAY = 7,
27+
}
28+
29+
// Repetition of a schema field, stored as a single byte.
// Referenced by SchemaElement.repetition_type, which declares no explicit default, so an absent
// field reads back as the zero value (REQUIRED).
enum FieldRepetitionType : byte {
30+
REQUIRED = 0,
31+
OPTIONAL = 1,
32+
REPEATED = 2,
33+
}
34+
35+
///////////////////////////////////////////////////////////////////////////////////////////////////
36+
// Encodings.
37+
///////////////////////////////////////////////////////////////////////////////////////////////////
38+
39+
// Note: Match the thrift enum values so that we can cast between them.
40+
// Encodings, stored as a single byte.
// Values 1 and 4 are deliberately left unassigned: the corresponding members are commented out
// below so that the remaining members keep their explicit numeric values.
// This enum is not referenced by any other declaration in the visible part of the schema.
enum Encoding : byte {
41+
PLAIN = 0,
42+
// GROUP_VAR_INT = 1,
43+
PLAIN_DICTIONARY = 2,
44+
RLE = 3,
45+
// BIT_PACKED = 4,
46+
DELTA_BINARY_PACKED = 5,
47+
DELTA_LENGTH_BYTE_ARRAY = 6,
48+
DELTA_BYTE_ARRAY = 7,
49+
RLE_DICTIONARY = 8,
50+
BYTE_STREAM_SPLIT = 9,
51+
}
52+
53+
// Note: Match the thrift enum values so that we can cast between them.
54+
// Compression codecs, stored as a single byte. Referenced by ColumnMetadata.codec.
// Value 5 is deliberately left unassigned: the LZ4 member is commented out below so that ZSTD
// and LZ4_RAW keep their explicit numeric values.
enum CompressionCodec : byte {
55+
UNCOMPRESSED = 0,
56+
SNAPPY = 1,
57+
GZIP = 2,
58+
LZO = 3,
59+
BROTLI = 4,
60+
// LZ4 = 5,
61+
ZSTD = 6,
62+
LZ4_RAW = 7,
63+
}
64+
65+
///////////////////////////////////////////////////////////////////////////////////////////////////
66+
// Logical types.
67+
///////////////////////////////////////////////////////////////////////////////////////////////////
68+
69+
// Field-less table. Used as the payload of union members that carry no parameters
// (most LogicalType members, and ColumnOrder.TypeDefinedOrder).
table Empty {}
70+
// Parameters of the DecimalType member of the LogicalType union.
// Both fields are plain (non-optional) ints, so an absent field reads back as 0.
table DecimalOpts {
71+
precision: int;
72+
scale: int;
73+
}
74+
// Time resolution used by TimeOpts.unit, stored as a single byte.
// NOTE(review): MS/US/NS presumably stand for milli-/micro-/nanoseconds -- confirm.
enum TimeUnit : byte {
75+
MS = 0,
76+
US = 1,
77+
NS = 2,
78+
}
79+
// Parameters shared by the TimeType and TimestampType members of the LogicalType union.
// Neither field has an explicit default, so absent fields read back as false / MS (value 0).
table TimeOpts {
80+
is_adjusted_to_utc: bool;
81+
unit: TimeUnit;
82+
}
83+
// Parameters of the IntType member of the LogicalType union.
table IntOpts {
84+
// Explicit schema default of 8: an absent bit_width reads back as 8, not 0.
bit_width: byte = 8;
85+
// No explicit default, so an absent is_signed reads back as false.
is_signed: bool;
86+
}
87+
// Parameters of the GeometryType member of the LogicalType union (the union member and this
// table share the same name).
table GeometryType {
88+
// NOTE(review): presumably a coordinate reference system identifier -- confirm expected format.
crs: string;
89+
}
90+
// Edge interpolation algorithm selected by GeographyType.algorithm, stored as a single byte.
// SPHERICAL is the zero value and therefore what an absent `algorithm` field reads back as.
enum EdgeInterpolationAlgorithm : byte {
91+
SPHERICAL = 0,
92+
VINCENTY = 1,
93+
THOMAS = 2,
94+
ANDOYER = 3,
95+
KARNEY = 4,
96+
}
97+
// Parameters of the GeographyType member of the LogicalType union (the union member and this
// table share the same name).
table GeographyType {
98+
// NOTE(review): presumably a coordinate reference system identifier -- confirm expected format.
crs: string;
99+
// No explicit default, so an absent field reads back as SPHERICAL (value 0).
algorithm: EdgeInterpolationAlgorithm;
100+
}
101+
// Logical type annotation of a schema element; referenced by SchemaElement.logical_type.
// Each member is declared as `Alias:Table`. Members mapped to Empty carry no parameters;
// the others carry their options table (DecimalOpts, TimeOpts, IntOpts, GeometryType,
// GeographyType). TimeType and TimestampType share the TimeOpts table.
// The position of a member in this list determines its union discriminant, so new members
// should only be appended at the end.
union LogicalType {
102+
StringType:Empty,
103+
MapType:Empty,
104+
ListType:Empty,
105+
EnumType:Empty,
106+
DecimalType:DecimalOpts,
107+
DateType:Empty,
108+
TimeType:TimeOpts,
109+
TimestampType:TimeOpts,
110+
IntType:IntOpts,
111+
NullType:Empty,
112+
JsonType:Empty,
113+
BsonType:Empty,
114+
UUIDType:Empty,
115+
Float16Type:Empty,
116+
VariantType:Empty,
117+
GeometryType:GeometryType,
118+
GeographyType:GeographyType,
119+
}
120+
121+
// Column chunk statistics; referenced by ColumnMetadata.statistics.
// Fields declared with `= null` are optional scalars: their absence can be distinguished from
// any concrete value.
table Statistics {
122+
null_count: int = null;
123+
// Store min/max values as fixed-size entities depending on the physical type. If len is present
124+
// then the min/max value is present.
125+
//
126+
// - BOOLEAN: none
127+
// - INT32/FLOAT: lo4 (little-endian)
128+
// - INT64/DOUBLE: lo8 (little-endian)
129+
// - INT96: lo4+lo8 (little-endian)
130+
// - FIXED_LEN_BYTE_ARRAY: (presumably the same layout as BYTE_ARRAY below -- TODO confirm)
131+
// - BYTE_ARRAY:
132+
// prefix: the longest common prefix of min/max
133+
// lo8+hi8: the suffix, zero-padded to 16 bytes (big-endian)
134+
// len: the length of the suffix of the value after removing the prefix. If > 16 then the
135+
// value is inexact
136+
min_lo4: uint;
137+
min_lo8: ulong;
138+
min_hi8: ulong;
139+
min_len: byte = null;
140+
max_lo4: uint;
141+
max_lo8: ulong;
142+
max_hi8: ulong;
143+
max_len: byte = null;
144+
prefix: string;
145+
}
146+
147+
// Column ordering; referenced by SchemaElement.column_order.
// Currently has a single, parameter-less member backed by the Empty table.
union ColumnOrder {
148+
TypeDefinedOrder:Empty,
149+
}
150+
151+
// One node of the file schema; FileMetaData.schema holds a vector of these.
// Fields declared with `= null` are optional scalars whose absence is distinguishable from any
// concrete value; fields without an explicit default read back as zero / absent.
table SchemaElement {
152+
name: string;
153+
// Optional physical type.
type: Type = null;
154+
// No explicit default, so an absent field reads back as REQUIRED (value 0).
repetition_type: FieldRepetitionType;
155+
logical_type: LogicalType;
156+
type_length: int = null;
157+
// Explicit default of 0.
num_children: int = 0;
158+
field_id: int = null;
159+
column_order: ColumnOrder; // only present for leaf nodes
160+
}
161+
162+
// Page kinds, stored as a single byte.
// This enum is not referenced by any other declaration in the visible part of the schema.
enum PageType : byte {
163+
DATA_PAGE = 0,
164+
INDEX_PAGE = 1,
165+
DICTIONARY_PAGE = 2,
166+
DATA_PAGE_V2 = 3,
167+
}
168+
169+
// String key/value pair. Used by ColumnMetadata.key_value_metadata and FileMetaData.kv.
table KV {
170+
key: string;
171+
val: string;
172+
}
173+
174+
// Metadata of a single column chunk; referenced by ColumnChunk.meta_data.
// Fields declared with `= null` are optional scalars whose absence is distinguishable from any
// concrete value.
// NOTE(review): optimization notes 3 and 4 at the top of this file say row-group-relative
// sizes and offsets allow `int` to be used, yet the size and offset fields below are declared
// `long`. Confirm which of the two is intended before the schema is frozen.
table ColumnMetadata {
175+
codec: CompressionCodec;
176+
num_values: long = null; // only present if not equal to rg.num_rows
177+
total_uncompressed_size: long;
178+
total_compressed_size: long;
179+
key_value_metadata: [KV];
180+
data_page_offset: long;
181+
index_page_offset: long = null;
182+
dictionary_page_offset: long = null;
183+
statistics: Statistics;
184+
// Replaces the removed encoding_stats (see optimization note 2 at the top of this file).
is_fully_dict_encoded: bool;
185+
bloom_filter_offset: long = null;
186+
bloom_filter_length: int = null;
187+
}
188+
189+
// A column chunk within a row group; RowGroup.columns holds a vector of these.
// The two encryption-related fields are not declared yet (commented out, marked TODO).
table ColumnChunk {
190+
file_path: string;
191+
meta_data: ColumnMetadata;
192+
// crypto_metadata: ColumnCryptoMetadata; // TODO
193+
// encrypted_column_metadata: [byte]; // TODO
194+
}
195+
196+
// Describes one sort key of a row group; RowGroup.sorting_columns holds a vector of these.
// No field has an explicit default, so absent fields read back as 0 / false.
table SortingColumn {
197+
// NOTE(review): presumably an index into the row group's columns -- confirm.
column_idx: int;
198+
descending: bool;
199+
nulls_first: bool;
200+
}
201+
202+
// A row group; FileMetaData.row_groups holds a vector of these.
// NOTE(review): the optimization notes at the top of this file disagree with this table:
//   - note 3 (sizes fit in `int`) vs. total_byte_size / total_compressed_size declared `long`;
//   - note 5 ("Remove ordinal") vs. the `ordinal` field still being declared;
//   - note 6 (at most 2^31-1 rows) vs. num_rows declared `long`.
// Confirm whether the notes or the schema reflect the intended design.
table RowGroup {
203+
columns: [ColumnChunk];
204+
total_byte_size: long;
205+
num_rows: long;
206+
sorting_columns: [SortingColumn];
207+
file_offset: long;
208+
total_compressed_size: long;
209+
// Optional scalar (`= null`): absence is distinguishable from any concrete value.
ordinal: short = null;
210+
}
211+
212+
// Top-level file metadata; declared as the schema's root_type.
// Column orders live on SchemaElement.column_order instead of a file-level vector, and the
// encryption-related fields are not declared yet (commented out, marked TODO).
table FileMetaData {
213+
version: int;
214+
schema: [SchemaElement];
215+
num_rows: long;
216+
row_groups: [RowGroup];
217+
kv: [KV];
218+
created_by: string;
219+
// column_orders: [ColumnOrder]; // moved to SchemaElement
220+
// encryption_algorithm: [EncryptionAlgorithm]; // TODO
221+
// footer_signing_key_metadata: binary; // TODO
222+
}
223+
224+
// FileMetaData is the root table of a serialized buffer.
root_type FileMetaData;

0 commit comments

Comments
 (0)