Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
eee128c
Add a default FileStatisticsCache implementation for the ListingTable
mkleen Jan 18, 2026
4c541cf
fixup! Add a default FileStatisticsCache implementation for the Listi…
mkleen Jan 28, 2026
023ae44
Adapt memory usage when removing entries
mkleen Feb 4, 2026
22c5d88
Adapt heapsize for &str
mkleen Feb 4, 2026
0ebbc19
Fix formatting
mkleen Feb 4, 2026
af72631
Adapt heapsize for &str and add another scalarvalue
mkleen Feb 4, 2026
763cdeb
Add better error message
mkleen Feb 10, 2026
20d0067
Add todo to add heapsize for ordering in CachedFileMetadata
mkleen Feb 10, 2026
9ff6f6e
Fix comment/docs on DefaultFileStatisticsCache
mkleen Feb 10, 2026
1ff8cc0
Simplify test data generation
mkleen Feb 10, 2026
abf5b63
Remove potential stale entry, if entry is too large
mkleen Feb 10, 2026
fb08429
Fix typo in sql logic test comment
mkleen Feb 10, 2026
5104ea9
Fix comment about default behaviour in cache manager
mkleen Feb 10, 2026
d238f35
Fix variable name in test
mkleen Feb 10, 2026
8c2c05f
Fix variable name in test
mkleen Feb 10, 2026
985b669
Disable cache for sql logic test
mkleen Feb 10, 2026
bf52f91
Include key into memory estimation
mkleen Feb 11, 2026
696a08a
Fix fmt
mkleen Feb 11, 2026
5c4946e
Fix clippy
mkleen Feb 11, 2026
3b9143b
minor
mkleen Feb 11, 2026
496ba35
Add more key memory accounting
mkleen Feb 12, 2026
182e6a7
Fix Formatting
mkleen Feb 12, 2026
ffcf7ef
Account path as string and remove dependency to object_store
mkleen Feb 12, 2026
1a31c6c
Improve error handling
mkleen Feb 12, 2026
f004d1e
Fix fmt
mkleen Feb 12, 2026
afeae20
Remove path.clone
mkleen Feb 12, 2026
f714227
Simplify accounting for statistics
mkleen Feb 12, 2026
009fbdf
Adapt offset buffer
mkleen Feb 12, 2026
c614341
Fix heap size for Arc
mkleen Feb 12, 2026
803d7bc
Adapt estimate in test
mkleen Feb 12, 2026
87bf9f2
Fix sql logic test
mkleen Feb 12, 2026
d3e9f4c
Register cache from cachemanager at listing table
mkleen Apr 8, 2026
9ea7069
Revert slt
mkleen Apr 8, 2026
913fdde
Add tablescoping for file stats cache
mkleen Feb 18, 2026
3ba5689
Adapt slt
mkleen Apr 9, 2026
47ea46a
Fix linter
mkleen Apr 9, 2026
da8d248
Remove unneeded clone
mkleen Apr 9, 2026
e74254f
Rename cache_unit to file_statistics_cache
mkleen Apr 9, 2026
4100642
Simplify heap size accounting
mkleen Apr 9, 2026
ddbccf2
Adapt comments in test
mkleen Apr 10, 2026
45c7157
Separate drop table clean-ups
mkleen Apr 10, 2026
cbd9b1e
fixup! Separate drop table clean-ups
mkleen Apr 10, 2026
8e45765
Increase default limit to 10 mb
mkleen Apr 15, 2026
9b48bd8
Increase default limit to 20 mb
mkleen Apr 15, 2026
66d433b
Fix comment
mkleen Apr 15, 2026
720bba0
Fix deregister logic
mkleen Apr 15, 2026
10ade1f
Fix slt
mkleen Apr 15, 2026
269ae9d
Add table reference to FileStatisticsCacheEntry
mkleen Apr 15, 2026
3b64005
fixup! Add table reference to FileStatisticsCacheEntry
mkleen Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 1 addition & 57 deletions datafusion-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,10 +443,7 @@ mod tests {
use super::*;
use datafusion::{
common::test_util::batches_to_string,
execution::cache::{
DefaultListFilesCache, cache_manager::CacheManagerConfig,
cache_unit::DefaultFileStatisticsCache,
},
execution::cache::{DefaultListFilesCache, cache_manager::CacheManagerConfig},
prelude::{ParquetReadOptions, col, lit, split_part},
};
use insta::assert_snapshot;
Expand Down Expand Up @@ -656,8 +653,6 @@ mod tests {
Ok(())
}

/// Shows that the statistics cache is not enabled by default yet
/// See https://github.com/apache/datafusion/issues/19217
#[tokio::test]
async fn test_statistics_cache_default() -> Result<(), DataFusionError> {
let ctx = SessionContext::new();
Expand Down Expand Up @@ -687,57 +682,6 @@ mod tests {
.await?;
}

// When the cache manager creates a StatisticsCache by default,
// the contents will show up here
let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;
assert_snapshot!(batches_to_string(&rbs),@r"
++
++
");

Ok(())
}

// Can be removed when https://github.com/apache/datafusion/issues/19217 is resolved
#[tokio::test]
async fn test_statistics_cache_override() -> Result<(), DataFusionError> {
// Install a specific StatisticsCache implementation
let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
let cache_config = CacheManagerConfig::default()
.with_files_statistics_cache(Some(file_statistics_cache.clone()));
let runtime = RuntimeEnvBuilder::new()
.with_cache_manager(cache_config)
.build()?;
let config = SessionConfig::new().with_collect_statistics(true);
let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime));

ctx.register_udtf(
"statistics_cache",
Arc::new(StatisticsCacheFunc::new(
ctx.task_ctx().runtime_env().cache_manager.clone(),
)),
);

for filename in [
"alltypes_plain",
"alltypes_tiny_pages",
"lz4_raw_compressed_larger",
] {
ctx.sql(
format!(
"create external table {filename}
stored as parquet
location '../parquet-testing/data/{filename}.parquet'",
)
.as_str(),
)
.await?
.collect()
.await?;
}

let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;
Expand Down
2 changes: 1 addition & 1 deletion datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ fn try_into_partitioned_file(

let mut pf: PartitionedFile = object_meta.into();
pf.partition_values = partition_values;

pf.table_reference.clone_from(table_path.get_table_ref());
Ok(pf)
}

Expand Down
35 changes: 19 additions & 16 deletions datafusion/catalog-listing/src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ use datafusion_datasource::{
};
use datafusion_execution::cache::TableScopedPath;
use datafusion_execution::cache::cache_manager::FileStatisticsCache;
use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
use datafusion_expr::dml::InsertOp;
use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
Expand Down Expand Up @@ -187,7 +186,7 @@ pub struct ListingTable {
/// The SQL definition for this table, if any
definition: Option<String>,
/// Cache for collected file statistics
collected_statistics: Arc<dyn FileStatisticsCache>,
collected_statistics: Option<Arc<dyn FileStatisticsCache>>,
/// Constraints applied to this table
constraints: Constraints,
/// Column default expressions for columns that are not physically present in the data files
Expand Down Expand Up @@ -231,7 +230,7 @@ impl ListingTable {
schema_source,
options,
definition: None,
collected_statistics: Arc::new(DefaultFileStatisticsCache::default()),
collected_statistics: None,
constraints: Constraints::default(),
column_defaults: HashMap::new(),
expr_adapter_factory: config.expr_adapter_factory,
Expand Down Expand Up @@ -260,10 +259,8 @@ impl ListingTable {
/// Setting a statistics cache on the `SessionContext` can avoid refetching statistics
/// multiple times in the same session.
///
/// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query.
pub fn with_cache(mut self, cache: Option<Arc<dyn FileStatisticsCache>>) -> Self {
self.collected_statistics =
cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default()));
self.collected_statistics = cache;
self
}

Expand Down Expand Up @@ -802,11 +799,15 @@ impl ListingTable {
) -> datafusion_common::Result<(Arc<Statistics>, Option<LexOrdering>)> {
use datafusion_execution::cache::cache_manager::CachedFileMetadata;

let path = &part_file.object_meta.location;
let path = TableScopedPath {
table: part_file.table_reference.clone(),
path: part_file.object_meta.location.clone(),
};
let meta = &part_file.object_meta;

// Check cache first - if we have valid cached statistics and ordering
if let Some(cached) = self.collected_statistics.get(path)
if let Some(cache) = &self.collected_statistics
&& let Some(cached) = cache.get(&path)
&& cached.is_valid_for(meta)
{
// Return cached statistics and ordering
Expand All @@ -823,14 +824,16 @@ impl ListingTable {
let statistics = Arc::new(file_meta.statistics);

// Store in cache
self.collected_statistics.put(
path,
CachedFileMetadata::new(
meta.clone(),
Arc::clone(&statistics),
file_meta.ordering.clone(),
),
);
if let Some(cache) = &self.collected_statistics {
cache.put(
&path,
CachedFileMetadata::new(
meta.clone(),
Arc::clone(&statistics),
file_meta.ordering.clone(),
),
);
}

Ok((statistics, file_meta.ordering))
}
Expand Down
Loading
Loading