fix: correct DictEncoder::estimated_memory_size

mzabaluev · mzabaluev · commit 19002abc3522 · 2026-04-14T20:20:19.000+03:00
The returned value should estimate the actual memory usage, but
instead it reported the encoded size of the dictionary data and
left out the hash table memory usage added by the Interner.
The implementation of Storage::estimated_memory_size for the
unique key storage was also incorrect, but it was unused.
Correct both problems.
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -64,7 +64,7 @@ impl<T: DataType> Storage for KeyStorage<T> {
     }
 
     fn estimated_memory_size(&self) -> usize {
-        self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
+        self.uniques.capacity() * std::mem::size_of::<T::T>()
     }
 }
 
@@ -183,6 +183,6 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
     ///
     /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
     fn estimated_memory_size(&self) -> usize {
-        self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()
+        self.interner.estimated_memory_size() + self.indices.len() * std::mem::size_of::<usize>()
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ impl<T: DataType> Storage for KeyStorage<T> {`
`64`	`64`	`}`
`65`	`65`
`66`	`66`	`fn estimated_memory_size(&self) -> usize {`
`67`		`- self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()`
	`67`	`+ self.uniques.capacity() * std::mem::size_of::<T::T>()`
`68`	`68`	`}`
`69`	`69`	`}`
`70`	`70`
`@@ -183,6 +183,6 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {`
`183`	`183`	`///`
`184`	`184`	`` /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]). ``
`185`	`185`	`fn estimated_memory_size(&self) -> usize {`
`186`		`- self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()`
	`186`	`+ self.interner.estimated_memory_size() + self.indices.len() * std::mem::size_of::<usize>()`
`187`	`187`	`}`
`188`	`188`	`}`