@@ -33,7 +33,7 @@ enum Type {
3333 BOOLEAN = 0 ;
3434 INT32 = 1 ;
3535 INT64 = 2 ;
36- INT96 = 3 ; // deprecated, only used by legacy implementations.
36+ INT96 = 3 ; // deprecated; new Parquet writers should not write data in INT96
3737 FLOAT = 4 ;
3838 DOUBLE = 5 ;
3939 BYTE_ARRAY = 6 ;
@@ -1076,12 +1076,21 @@ union ColumnOrder {
10761076 * BOOLEAN - false, true
10771077 * INT32 - signed comparison
10781078 * INT64 - signed comparison
1079- * INT96 (only used for legacy timestamps) - undefined
1079+ * INT96 (only used for legacy timestamps) - undefined (+)
10801080 * FLOAT - signed comparison of the represented value (*)
10811081 * DOUBLE - signed comparison of the represented value (*)
10821082 * BYTE_ARRAY - unsigned byte-wise comparison
10831083 * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
10841084 *
1085+ * (+) While the INT96 type has been deprecated, at the time of writing it is
1086+ * still used in many legacy systems. If a Parquet implementation chooses
1087+ * to write statistics for INT96 columns, it is recommended to order them
1088+ * according to the legacy rules:
1089+ * - compare the last 4 bytes (days) as a little-endian 32-bit signed integer
1090+ * - if the last 4 bytes are equal, compare the first 8 bytes (nanos) as a
1091+ * little-endian 64-bit signed integer
1092+ * See https://github.com/apache/parquet-format/issues/502 for more details.
1093+ *
10851094 * (*) Because the sorting order is not specified properly for floating
10861095 * point values (relations vs. total ordering) the following
10871096 * compatibility rules should be applied when reading statistics:
0 commit comments