forked from NVIDIA/NeMo-Agent-Toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_models.py
More file actions
188 lines (150 loc) · 6.03 KB
/
data_models.py
File metadata and controls
188 lines (150 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing
from pathlib import Path
from pydantic import BaseModel
from pydantic import Field
class FitConfig(BaseModel):
    """
    Configuration parameters for linear fit and outlier detection.

    The thresholds drive IQR-based outlier removal, and ``min_r_squared``
    sets the minimum fit quality accepted for a linear fit.
    """
    # Concurrency ranges with at most this many points are checked for
    # extreme outliers in the raw y-values first.
    small_concurrency_range_threshold: int = Field(default=8)
    # Points beyond 2.0x the IQR are treated as extreme outliers and removed.
    extreme_outlier_threshold: float = Field(default=2.0)
    # Points beyond 1.5x the IQR are treated as conservative outliers and removed.
    conservative_outlier_threshold: float = Field(default=1.5)
    # A linear fit must reach this R-squared value to be considered valid.
    min_r_squared: float = Field(default=0.7)
    # Whether outliers are removed during the linear fit calculation.
    remove_outliers: bool = Field(default=True)
class CalcRunnerConfig(BaseModel):
    """
    Parameters used for a calc runner.

    Combines workflow-execution settings (endpoint, passes, concurrencies),
    GPU-estimation targets, and output/plotting options.
    """
    # Base config and endpoints (if remote) - not needed in offline mode
    config_file: Path | None = None
    # Endpoint to use for the workflow; if not provided the workflow is run locally
    endpoint: str | None = None
    # Timeout for the workflow (units not stated here — presumably seconds; confirm with caller)
    endpoint_timeout: int = 300
    # If true the workflow is not run; instead results from previous runs are used to
    # estimate the GPU count
    offline_mode: bool = False
    # Number of passes at each concurrency; if 0 the dataset is adjusted to a multiple
    # of the concurrency
    num_passes: int = 0
    # Concurrency values to test. default_factory gives each instance its own list
    # (consistent with fit_config below; avoids a shared mutable class attribute).
    concurrencies: list[int] = Field(default_factory=lambda: [1, 2, 4, 8])
    # Targets for GPU estimation
    target_llm_latency_p95: float = 0
    target_workflow_runtime_p95: float = 0
    target_users: int = 0
    # Test setup information needed for GPU estimation
    test_gpu_count: int = 0
    # Output directory for results
    output_dir: Path | None = None
    # If true, the job is stored in a new subdirectory of the output directory
    append_job: bool = False
    # If true, the data is plotted
    plot_data: bool = True
    # Configuration for linear fit and outlier detection
    fit_config: FitConfig = Field(default_factory=FitConfig)
# Sizing metrics are gathered from the evaluation runs and used as input by the calculator.
class SizingMetricPerItem(BaseModel):
    """
    Sizing metrics per dataset entry item.

    One instance is recorded per dataset item; aggregated per-concurrency
    values live in SizingMetrics.
    """
    # LLM latency for this item (units not shown here — presumably seconds; confirm with producer)
    llm_latency: float
    # Workflow runtime for this item (same units as llm_latency)
    workflow_runtime: float
class SizingMetricsAlerts(BaseModel):
    """
    Sizing metrics alerts.
    """
    # If true, the workflow was interrupted, so results at that concurrency cannot be used
    workflow_interrupted: bool = False
class SizingMetrics(BaseModel):
    """
    Sizing metrics for a single concurrency.

    Aggregates p95 latency/runtime figures, the total runtime, per-item
    metrics, and any alerts raised while gathering them.
    """
    # Alerts associated with the sizing metrics
    alerts: SizingMetricsAlerts = Field(default_factory=SizingMetricsAlerts)
    # p95 LLM latency
    llm_latency_p95: float = 0.0
    # p95 workflow runtime
    workflow_runtime_p95: float = 0.0
    # Total workflow runtime
    total_runtime: float = 0.0
    # Per-item metrics, key is the dataset entry id. default_factory keeps the default
    # consistent with the other fields and avoids a shared mutable class attribute.
    per_item_metrics: dict[typing.Any, SizingMetricPerItem] = Field(default_factory=dict)
class LinearFitResult(BaseModel):
    """
    Result of linear regression including slope, intercept, and quality metrics.
    """
    # Slope of the fitted line
    slope: float
    # Intercept of the fitted line
    intercept: float
    # Coefficient of determination of the fit (higher is better; compared against
    # FitConfig.min_r_squared elsewhere — presumably in the fitting code)
    r_squared: float
    # Data points removed as outliers during fitting (presumably indices or
    # concurrency values — confirm against the fit implementation)
    outliers_removed: list[int]
class FitResults(BaseModel):
    """
    Linear fit results for both LLM latency and workflow runtime analysis.
    """
    # Fit for LLM latency; None when no fit was produced
    llm_latency_fit: LinearFitResult | None = None
    # Fit for workflow runtime; None when no fit was produced
    wf_runtime_fit: LinearFitResult | None = None
# GPU estimates are generated by the calculator.
class GPUEstimates(BaseModel):
    """
    GPU estimates.

    Both fields are optional so a missing estimate (None) is distinguishable
    from an estimate of zero.
    """
    # GPU estimate based on the workflow runtime
    gpu_estimate_by_wf_runtime: float | None = None
    # GPU estimate based on the LLM latency
    gpu_estimate_by_llm_latency: float | None = None
# Calc runner alerts are generated by the calculator.
class CalcAlerts(BaseModel):
    """
    Calc runner alerts.

    Flags and counts describing problems detected for a single run.
    """
    # If true, the run was identified as an outlier by the workflow runtime linear fit
    outlier_workflow_runtime: bool = False
    # If true, the run was identified as an outlier by the LLM latency linear fit
    outlier_llm_latency: bool = False
    # Number of items that are greater than the target latency
    num_items_greater_than_target_latency: int = 0
    # Number of items that are greater than the target runtime
    num_items_greater_than_target_runtime: int = 0
class CalcData(BaseModel):
    """
    Output of the calc runner per concurrency.
    """
    # ROUGH GPU estimates per concurrency: these are not used for the final GPU estimation,
    # they are only available for information purposes
    gpu_estimates: GPUEstimates = Field(default_factory=GPUEstimates)
    # Calc runner alerts
    alerts: CalcAlerts = Field(default_factory=CalcAlerts)
    # Sizing metrics
    sizing_metrics: SizingMetrics = Field(default_factory=SizingMetrics)
class CalcRunnerOutput(BaseModel):
    """
    Output of the calc runner.

    Top-level result: final GPU estimates, the fit results backing them,
    and the per-concurrency data they were derived from.
    """
    # GPU estimates based on the slope of the time vs concurrency, calculated online or offline
    gpu_estimates: GPUEstimates = Field(default_factory=GPUEstimates)
    # Linear fit results for analysis and debugging
    fit_results: FitResults = Field(default_factory=FitResults)
    # Per-concurrency data (GPU estimates, out-of-range runs, and sizing metrics),
    # keyed by concurrency. default_factory keeps the default consistent with the
    # other fields and avoids a shared mutable class attribute.
    calc_data: dict[int, CalcData] = Field(default_factory=dict)