-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathexec_delta_housekeeping.py
More file actions
65 lines (43 loc) · 1.79 KB
/
exec_delta_housekeeping.py
File metadata and controls
65 lines (43 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Databricks notebook source
# MAGIC %md
# MAGIC # Run Delta Housekeeping across multiple tables
# MAGIC Analysis that provides stats on Delta tables / recommendations for improvements, including:
# MAGIC - stats:size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE)
# MAGIC - recommendations on tables that need to be OPTIMIZED/VACUUM'ed
# MAGIC - are tables OPTIMIZED/VACUUM'ed often enough
# MAGIC - tables that have small files / tables for which ZORDER is not being effective
# MAGIC
# COMMAND ----------
# MAGIC %pip install dbl-discoverx
# COMMAND ----------
# MAGIC %md
# MAGIC ### Declare Variables
# COMMAND ----------
dbutils.widgets.text("catalogs", "*", "Catalogs")
dbutils.widgets.text("schemas", "*", "Schemas")
dbutils.widgets.text("tables", "*", "Tables")
# COMMAND ----------
catalogs = dbutils.widgets.get("catalogs")
schemas = dbutils.widgets.get("schemas")
tables = dbutils.widgets.get("tables")
from_table_statement = ".".join([catalogs, schemas, tables])
# COMMAND ----------
from discoverx import DX
dx = DX()
# COMMAND ----------
# DBTITLE 1,Run the discoverx DeltaHousekeeping operation -generates an output object on which you can run operations
output = (
dx.from_tables(from_table_statement)
.delta_housekeeping()
)
# COMMAND ----------
# DBTITLE 1,apply() operation generates a spark dataframe with recommendations
result = output.apply()
result.select("catalog", "database", "tableName", *[c for c in result.columns if c.startswith("rec_")]).display()
# COMMAND ----------
# DBTITLE 1,display() runs apply and displays the full result (including stats per table)
output.display()
# COMMAND ----------
# DBTITLE 1,explain() outputs the DeltaHousekeeping recommendations in HTML format
output.explain()
# COMMAND ----------