-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathreporting.py
More file actions
344 lines (283 loc) · 12.5 KB
/
reporting.py
File metadata and controls
344 lines (283 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
"""Environment reporting utilities for Fiddler.
This module provides EnvironmentReporter, a high-level facade for environment
analysis and reporting that simplifies the env_stats notebook workflow.
"""
from typing import Optional, List
import logging
from .projects import (
ProjectManager,
EnvironmentHierarchy,
EnvironmentStats,
TimestampAnalysis,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class EnvironmentReporter:
    """High-level facade for environment analysis and reporting.

    Combines ProjectManager discovery with formatted output, export, and
    analysis - essentially the env_stats notebook as a reusable class.

    This class provides a simple interface for common environment analysis
    tasks:

    1. analyze_environment() - collect data
    2. generate_report() - display formatted output
    3. export_to_csv() - export results

    Example:
        ```python
        from fiddler_utils import get_or_init, EnvironmentReporter

        get_or_init(url=URL, token=TOKEN, log_level='ERROR')

        # Run complete analysis
        reporter = EnvironmentReporter()
        reporter.analyze_environment(
            include_features=True,
            include_timestamps=True
        )

        # Display formatted report
        reporter.generate_report(top_n=15)

        # Export to CSV
        files = reporter.export_to_csv(prefix='env_stats')
        print(f"Exported: {files}")
        ```
    """

    # Width of the "=" rules that frame the console report.
    _RULE_WIDTH = 70
    # Longest bar (in block characters) drawn next to a "top" entry.
    _MAX_BAR = 50
    # Row caps for the fixed-length model lists in the timestamp section.
    _MAX_DATED_MODELS = 10
    _MAX_UPDATED_MODELS = 5

    def __init__(self, project_manager: Optional["ProjectManager"] = None):
        """Initialize EnvironmentReporter.

        Args:
            project_manager: Optional ProjectManager instance (creates new
                if None)
        """
        # Test against None explicitly so a caller-supplied manager is never
        # discarded just because it happens to evaluate as falsy.
        self.project_mgr = (
            project_manager if project_manager is not None else ProjectManager()
        )
        self._hierarchy: Optional[EnvironmentHierarchy] = None
        self._stats: Optional[EnvironmentStats] = None
        self._timestamp_analysis: Optional[TimestampAnalysis] = None

    def analyze_environment(
        self,
        include_features: bool = True,
        include_timestamps: bool = True,
        include_assets: bool = False
    ):
        """Run complete environment analysis (data collection phase).

        Asks the project manager for the environment hierarchy, derives
        statistics from it and, when requested, a timestamp analysis. The
        results are cached on the reporter for the report/export/getter
        methods.

        Every call replaces all cached results. In particular, calling with
        ``include_timestamps=False`` clears any timestamp analysis left over
        from a previous run, so the report never mixes data from two runs.

        Args:
            include_features: Extract feature lists from model specs
            include_timestamps: Include created_at/updated_at timestamps
            include_assets: Include counts of segments, metrics, alerts
                (slower)

        Example:
            ```python
            reporter = EnvironmentReporter()
            reporter.analyze_environment(
                include_features=True,
                include_timestamps=True
            )
            ```
        """
        logger.info('Starting environment analysis')

        # Collect into locals first: if any step raises, the previously
        # cached results stay intact and mutually consistent.
        hierarchy = self.project_mgr.get_environment_hierarchy(
            include_features=include_features,
            include_timestamps=include_timestamps,
            include_assets=include_assets
        )
        stats = self.project_mgr.get_environment_statistics(
            hierarchy=hierarchy
        )
        timestamp_analysis = None
        if include_timestamps:
            timestamp_analysis = self.project_mgr.get_timestamp_analysis(
                hierarchy=hierarchy
            )

        # Publish all three results together.
        self._hierarchy = hierarchy
        self._stats = stats
        self._timestamp_analysis = timestamp_analysis

        logger.info(
            'Environment analysis complete: %s projects, %s models, %s features',
            stats.total_projects,
            stats.total_models,
            stats.total_features,
        )

    def generate_report(
        self,
        show_projects: bool = True,
        show_models: bool = True,
        show_timestamps: bool = True,
        show_newest_oldest: bool = True,
        top_n: int = 15
    ):
        """Generate and print formatted report to console.

        Displays comprehensive environment analysis including:
        - Overall statistics (projects, models, features)
        - Distribution metrics (mean, median, min, max)
        - Top projects by model count
        - Top models by feature count
        - Timestamp analysis (if available)
        - Newest and oldest models (if available)

        Args:
            show_projects: Show project breakdown
            show_models: Show model analysis
            show_timestamps: Show timestamp analysis
            show_newest_oldest: Show newest/oldest/recently-updated models.
                These lists are part of the timestamp section, so they only
                appear when that section is printed.
            top_n: Number of items to show in top lists. Negative values
                are treated as 0.

        Raises:
            RuntimeError: If analyze_environment() has not been called yet

        Example:
            ```python
            reporter.generate_report(
                show_projects=True,
                show_models=True,
                show_timestamps=True,
                top_n=10
            )
            ```
        """
        if self._hierarchy is None or self._stats is None:
            raise RuntimeError(
                'analyze_environment() must be called before generate_report(). '
                'Call reporter.analyze_environment() first.'
            )

        stats = self._stats
        # A negative slice bound would silently drop trailing rows instead
        # of limiting the list, so clamp it.
        limit = max(top_n, 0)
        rule = "=" * self._RULE_WIDTH

        # Overall summary
        print("\n" + rule)
        print("FIDDLER ENVIRONMENT ANALYSIS")
        print(rule)
        print("\n📊 Overall Statistics:")
        print(f" Total Projects: {stats.total_projects}")
        print(f" Total Models: {stats.total_models}")
        print(f" Total Features: {stats.total_features}")

        if show_projects and stats.total_projects > 0:
            self._print_project_section(stats, limit)

        if show_models and stats.total_models > 0:
            self._print_model_section(stats, limit)

        if show_timestamps and self._timestamp_analysis is not None:
            self._print_timestamp_section(
                self._timestamp_analysis, show_newest_oldest
            )

        print("\n" + rule)
        print("✓ Analysis complete")
        print(rule + "\n")

    def _print_project_section(self, stats, limit):
        """Print models-per-project distribution and the top projects."""
        print("\n📁 Models per Project:")
        print(f" Mean: {stats.models_per_project_mean:.1f}")
        print(f" Median: {stats.models_per_project_median:.1f}")
        print(f" Range: {stats.models_per_project_min} - {stats.models_per_project_max}")

        if stats.top_projects_by_models:
            print(f"\n🏆 Top {limit} Projects by Model Count:")
            ranked = enumerate(stats.top_projects_by_models[:limit], 1)
            for rank, (name, count) in ranked:
                # One block per model, capped so the bar stays on one line.
                bar = "█" * min(count, self._MAX_BAR)
                print(f" {rank:2d}. {name:35s} {count:4d} │{bar}")

    def _print_model_section(self, stats, limit):
        """Print features-per-model distribution and the top models."""
        print("\n🔧 Features per Model:")
        print(f" Mean: {stats.features_per_model_mean:.1f}")
        print(f" Median: {stats.features_per_model_median:.1f}")
        print(f" Range: {stats.features_per_model_min} - {stats.features_per_model_max}")

        if stats.top_models_by_features:
            print(f"\n🏆 Top {limit} Models by Feature Count:")
            ranked = enumerate(stats.top_models_by_features[:limit], 1)
            for rank, (proj, model, count) in ranked:
                model_display = f"{proj}/{model}"
                # One block per five features, capped like the project bars.
                bar = "█" * min(count // 5, self._MAX_BAR)
                print(f" {rank:2d}. {model_display:45s} {count:4d} │{bar}")

    def _print_timestamp_section(self, ts, show_newest_oldest):
        """Print timestamp coverage, date range and optional model lists."""
        print("\n📅 Timestamp Analysis:")
        print(f" Models with timestamps: {ts.models_with_timestamps} "
              f"({ts.timestamp_coverage_pct:.1f}% coverage)")

        if ts.earliest_created and ts.latest_created:
            print(f" Earliest created: {ts.earliest_created.strftime('%Y-%m-%d')}")
            print(f" Latest created: {ts.latest_created.strftime('%Y-%m-%d')}")

        if ts.most_recent_update:
            print(f" Most recent update: {ts.most_recent_update.strftime('%Y-%m-%d')}")

        if ts.avg_days_between_create_update is not None:
            print(f" Avg days between create/update: {ts.avg_days_between_create_update:.1f}")

        if not show_newest_oldest:
            return

        if ts.newest_models:
            self._print_model_dates(
                "🆕 Newest Models", ts.newest_models,
                'created_at', self._MAX_DATED_MODELS,
            )
        if ts.oldest_models:
            self._print_model_dates(
                "👴 Oldest Models", ts.oldest_models,
                'created_at', self._MAX_DATED_MODELS,
            )
        if ts.most_recently_updated_models:
            self._print_model_dates(
                "🔄 Most Recently Updated Models",
                ts.most_recently_updated_models,
                'updated_at', self._MAX_UPDATED_MODELS,
            )

    @staticmethod
    def _print_model_dates(heading, models, date_attr, limit):
        """Print up to ``limit`` models with the date stored in ``date_attr``."""
        print(f"\n{heading} (Top {min(limit, len(models))}):")
        for rank, model in enumerate(models[:limit], 1):
            stamp = getattr(model, date_attr)
            shown = stamp.strftime('%Y-%m-%d') if stamp else 'N/A'
            print(f" {rank:2d}. {model.name:40s} {shown}")

    def export_to_csv(
        self,
        output_dir: str = '.',
        prefix: str = 'env_stats'
    ) -> List[str]:
        """Export analysis results to CSV files.

        Creates multiple CSV files:
        - {prefix}__overview.csv (model-level data)
        - {prefix}__flattened_hierarchy.csv (feature-level data)

        Args:
            output_dir: Output directory path
            prefix: Filename prefix

        Returns:
            List of created file paths

        Raises:
            RuntimeError: If analyze_environment() has not been called yet

        Example:
            ```python
            files = reporter.export_to_csv(
                output_dir='exports',
                prefix='fiddler_env'
            )
            print(f"Created {len(files)} files: {files}")
            ```
        """
        if self._hierarchy is None:
            raise RuntimeError(
                'analyze_environment() must be called before export_to_csv(). '
                'Call reporter.analyze_environment() first.'
            )

        # NOTE(review): the cached hierarchy is not handed to the exporter;
        # presumably the project manager gathers or caches the data itself -
        # confirm against ProjectManager.export_environment_to_csv.
        files = self.project_mgr.export_environment_to_csv(
            output_dir=output_dir,
            prefix=prefix
        )
        logger.info('Exported %s CSV files', len(files))
        return files

    def get_hierarchy(self) -> "EnvironmentHierarchy":
        """Get raw hierarchy data for custom analysis.

        Returns:
            EnvironmentHierarchy dataclass

        Raises:
            RuntimeError: If analyze_environment() has not been called yet

        Example:
            ```python
            hierarchy = reporter.get_hierarchy()
            for project_id, project in hierarchy.projects.items():
                print(f"{project.name}: {project.model_count} models")
            ```
        """
        if self._hierarchy is None:
            raise RuntimeError(
                'analyze_environment() must be called first. '
                'Call reporter.analyze_environment() to collect data.'
            )
        return self._hierarchy

    def get_statistics(self) -> "EnvironmentStats":
        """Get statistics object for custom analysis.

        Returns:
            EnvironmentStats dataclass

        Raises:
            RuntimeError: If analyze_environment() has not been called yet

        Example:
            ```python
            stats = reporter.get_statistics()
            print(f"Average features per model: {stats.features_per_model_mean}")
            ```
        """
        if self._stats is None:
            raise RuntimeError(
                'analyze_environment() must be called first. '
                'Call reporter.analyze_environment() to collect data.'
            )
        return self._stats

    def get_timestamp_analysis(self) -> Optional["TimestampAnalysis"]:
        """Get timestamp analysis object for custom analysis.

        Returns:
            TimestampAnalysis dataclass, or None if the most recent
            analyze_environment() call did not collect timestamps

        Raises:
            RuntimeError: If analyze_environment() has not been called yet

        Example:
            ```python
            ts_analysis = reporter.get_timestamp_analysis()
            if ts_analysis:
                print(f"Coverage: {ts_analysis.timestamp_coverage_pct}%")
            ```
        """
        if self._hierarchy is None:
            raise RuntimeError(
                'analyze_environment() must be called first. '
                'Call reporter.analyze_environment() to collect data.'
            )
        return self._timestamp_analysis