@@ -309,11 +309,11 @@ struct Statistics {
309309 7: optional bool is_max_value_exact ;
310310 /** If true, min_value is the actual minimum value for a column */
311311 8: optional bool is_min_value_exact ;
312- /**
313- * count of NaN values in the column; only present if physical type is FLOAT
314- * or DOUBLE, or logical type is FLOAT16.
315- * Readers MUST distinguish between nan_count not being present and nan_count == 0.
316- * If nan_count is not present, readers MUST NOT assume nan_count == 0.
312+ /**
313+ * Count of NaN values in the column; only present if physical type is FLOAT
314+ * or DOUBLE, or logical type is FLOAT16.
315+ * If this field is not present, readers must assume NaNs may be present
316+ * (MUST NOT assume nan_count == 0).
317317 */
318318 9: optional i64 nan_count ;
319319}
@@ -677,7 +677,7 @@ enum BoundaryOrder {
677677/** Data page header */
678678struct DataPageHeader {
679679 /**
680- * Number of values, including nulls , in this data page.
680+ * Number of values, including NULLs, in this data page.
681681 *
682682 * If an OffsetIndex is present, a page must begin at a row
683683 * boundary (repetition_level = 0). Otherwise, pages may begin
@@ -729,9 +729,9 @@ struct DictionaryPageHeader {
729729 * regardless of which page header is used.
730730 **/
731731struct DataPageHeaderV2 {
732- /** Number of values, including nulls , in this data page. **/
732+ /** Number of values, including NULLs, in this data page. **/
733733 1: required i32 num_values
734- /** Number of null values, in this data page.
734+ /** Number of NULL values, in this data page.
735735 Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
736736 2: required i32 num_nulls
737737 /**
@@ -1122,10 +1122,9 @@ union ColumnOrder {
11221122 * 64-bit signed integer (nanos)
11231123 * See https://github.com/apache/parquet-format/issues/502 for more details
11241124 *
1125- * (*) Because the precise sorting order is ambiguous for floating
1126- * point types due to underspecified handling of NaN and -0/+0,
1127- * it is recommended that writers use IEEE_754_TOTAL_ORDER
1128- * for these types.
1125+ * (*) Because TYPE_ORDER is ambiguous for floating point types due to
1126+ * underspecified handling of NaN and -0/+0, it is recommended that writers
1127+ * use IEEE_754_TOTAL_ORDER for these types.
11291128 *
11301129 * If TYPE_ORDER is used for floating point types, then the following
11311130 * compatibility rules should be applied when reading statistics:
@@ -1134,20 +1133,19 @@ union ColumnOrder {
11341133 * - If the nan_count field is set, a reader can compute
11351134 * nan_count + null_count == num_values to deduce whether all non-null
11361135 * values are NaN.
1137- * - When looking for NaN values, min and max should be ignored.
1138- * If the nan_count field is set, it can be used to check whether
1139- * NaNs are present.
11401136 * - If the min is +0, the row group may contain -0 values as well.
11411137 * - If the max is -0, the row group may contain +0 values as well.
11421138 * - When looking for NaN values, min and max should be ignored.
1139+ * If the nan_count field is set, it can be used to check whether
1140+ * NaNs are present.
11431141 *
11441142 * When writing statistics the following rules should be followed:
1145- * - It is suggested to always set the nan_count field for floating
1146- * point types, especially also if it is zero.
1143+ * - Always set the nan_count field for floating point types,
1144+ * even if it is zero.
11471145 * - NaNs should not be written to min or max statistics fields except
1148- * in the column index, where min_values and max_values are not optional
1149- * so a NaN value must be written if all non-null values in a page
1150- * are NaN .
1146+ * in the column index when a page contains only NaN values. In this
1147+ * case, since min_values and max_values are required, a NaN value
1148+ * must be written.
11511149 * - If the computed max value is zero (whether negative or positive),
11521150 * `+0.0` should be written into the max statistics field.
11531151 * - If the computed min value is zero (whether negative or positive),
@@ -1202,14 +1200,10 @@ union ColumnOrder {
12021200 *
12031201 * When writing statistics for columns with this order, the following rules
12041202 * must be followed:
1205- * - Writing the nan_count field is mandatory when using this ordering,
1206- * especialy also if it is zero.
1207- * - NaNs should not be written to min or max statistics fields except
1208- * in the column index, where min_values and max_values are not optional
1209- * so a NaN value must be written if all non-null values in a page
1210- * are NaN. In this case, the min_values[i] and max_values[i] fields
1211- * should be set to the smallest and largest NaN values contained
1212- * in the page, as defined by the IEEE 754 total order.
1203+ * - Writing the nan_count field is mandatory when using this ordering.
1204+ * - Min and max statistics must contain the smallest and largest non-NaN
1205+ * values respectively, or if all non-null values are NaN, the smallest and
1206+ * largest NaN values as defined by IEEE 754 total order.
12131207 *
12141208 * When reading statistics for columns with this order, the following rules
12151209 * should be followed:
@@ -1292,19 +1286,17 @@ struct ColumnIndex {
12921286 * Such more compact values must still be valid values within the column's
12931287 * logical type. Readers must make sure that list entries are populated before
12941288 * using them by inspecting null_pages.
1289+ *
12951290 * For columns of physical type FLOAT or DOUBLE, or logical type FLOAT16,
12961291 * NaN values are not to be included in these bounds. If all non-null values
12971292 * of a page are NaN, then a writer must do the following:
1298- * - If the order of this column is TypeDefinedOrder , then no column index
1293+ * - If the order of this column is TYPE_ORDER, then no column index
12991294 * must be written for this column chunk. While this is unfortunate for
13001295 * performance, it is necessary to avoid conflict with legacy files that
13011296 * still included NaN in min_values and max_values even if the page had
13021297 * non-NaN values. To mitigate this, IEEE754_TOTAL_ORDER is recommended.
13031298 * - If the order of this column is IEEE754_TOTAL_ORDER, then min_values[i]
1304- * * If IEEE754_TOTAL_ORDER is used for the column and all non-null values
1305- * of a page are NaN, then min_values[i] and max_values[i] must be set to
1306- * the smallest and largest NaN value contained in the page, as defined
1307- * by the IEEE 754 total order.
1299+ * and max_values[i] of that page must be set to a standard NaN value.
13081300 */
13091301 2: required list<binary> min_values
13101302 3: required list<binary> max_values
@@ -1328,6 +1320,7 @@ struct ColumnIndex {
13281320 * null counts are 0.
13291321 */
13301322 5: optional list<i64> null_counts
1323+
13311324 /**
13321325 * Contains repetition level histograms for each page
13331326 * concatenated together. The repetition_level_histogram field on
@@ -1346,12 +1339,11 @@ struct ColumnIndex {
13461339 **/
13471340 7: optional list<i64> definition_level_histograms ;
13481341
1349- /**
1342+ /**
13501343 * A list containing the number of NaN values for each page. Only present
13511344 * for columns of physical type FLOAT or DOUBLE, or logical type FLOAT16.
1352- * If this field is not present, readers MUST assume that there might or
1353- * might not be NaN values in any page, as NaNs should not be included
1354- * in min_values or max_values.
1345+ * If this field is not present, readers MUST assume that there might be
1346+ * NaN values in any page.
13551347 */
13561348 8: optional list<i64> nan_counts
13571349
0 commit comments