forked from NVIDIA/NeMo-Agent-Toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_models.py
More file actions
188 lines (150 loc) · 6.03 KB
/
data_models.py
File metadata and controls
188 lines (150 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing
from pathlib import Path
from pydantic import BaseModel
from pydantic import Field
class FitConfig(BaseModel):
    """
    Configuration parameters for linear fit and outlier detection.

    The thresholds drive IQR-based outlier removal, and ``min_r_squared``
    sets the minimum fit quality accepted for a linear fit.
    """
    # Concurrency ranges with at most this many points are checked for
    # extreme outliers in the raw y-values first.
    small_concurrency_range_threshold: int = Field(default=8)
    # Points beyond 2.0x the IQR are treated as extreme outliers and removed.
    extreme_outlier_threshold: float = Field(default=2.0)
    # Points beyond 1.5x the IQR are treated as conservative outliers and removed.
    conservative_outlier_threshold: float = Field(default=1.5)
    # A linear fit must reach this R-squared value to be considered valid.
    min_r_squared: float = Field(default=0.7)
    # Whether outliers are removed during the linear fit calculation.
    remove_outliers: bool = Field(default=True)
class CalcRunnerConfig(BaseModel):
    """
    Parameters used for a calc runner.

    Combines workflow-execution settings (endpoint, passes, concurrencies),
    GPU-estimation targets, and output/plotting options.
    """
    # Base config and endpoints (if remote) - not needed in offline mode
    config_file: Path | None = None
    # Endpoint to use for the workflow; if not provided the workflow is run locally
    endpoint: str | None = None
    # Timeout for the workflow (units not stated here — presumably seconds; confirm with caller)
    endpoint_timeout: int = 300
    # If true the workflow is not run; instead results from previous runs are used to
    # estimate the GPU count
    offline_mode: bool = False
    # Number of passes at each concurrency; if 0 the dataset is adjusted to a multiple
    # of the concurrency
    num_passes: int = 0
    # Concurrency values to test. default_factory gives each instance its own list
    # (consistent with fit_config below; avoids a shared mutable class attribute).
    concurrencies: list[int] = Field(default_factory=lambda: [1, 2, 4, 8])
    # Targets for GPU estimation
    target_llm_latency_p95: float = 0
    target_workflow_runtime_p95: float = 0
    target_users: int = 0
    # Test setup information needed for GPU estimation
    test_gpu_count: int = 0
    # Output directory for results
    output_dir: Path | None = None
    # If true, the job is stored in a new subdirectory of the output directory
    append_job: bool = False
    # If true, the data is plotted
    plot_data: bool = True
    # Configuration for linear fit and outlier detection
    fit_config: FitConfig = Field(default_factory=FitConfig)
# Sizing metrics are gathered from the evaluation runs and used as input by the calculator.
class SizingMetricPerItem(BaseModel):
    """
    Sizing metrics per dataset entry item.

    One instance is recorded per dataset item; aggregated per-concurrency
    values live in SizingMetrics.
    """
    # LLM latency for this item (units not shown here — presumably seconds; confirm with producer)
    llm_latency: float
    # Workflow runtime for this item (same units as llm_latency)
    workflow_runtime: float
class SizingMetricsAlerts(BaseModel):
    """
    Sizing metrics alerts.
    """
    # If true, the workflow was interrupted, so results at that concurrency cannot be used
    workflow_interrupted: bool = False
class SizingMetrics(BaseModel):
    """
    Sizing metrics for a single concurrency.

    Aggregates p95 latency/runtime figures, the total runtime, per-item
    metrics, and any alerts raised while gathering them.
    """
    # Alerts associated with the sizing metrics
    alerts: SizingMetricsAlerts = Field(default_factory=SizingMetricsAlerts)
    # p95 LLM latency
    llm_latency_p95: float = 0.0
    # p95 workflow runtime
    workflow_runtime_p95: float = 0.0
    # Total workflow runtime
    total_runtime: float = 0.0
    # Per-item metrics, key is the dataset entry id. default_factory keeps the default
    # consistent with the other fields and avoids a shared mutable class attribute.
    per_item_metrics: dict[typing.Any, SizingMetricPerItem] = Field(default_factory=dict)
class LinearFitResult(BaseModel):
    """
    Result of linear regression including slope, intercept, and quality metrics.
    """
    # Slope of the fitted line
    slope: float
    # Intercept of the fitted line
    intercept: float
    # Coefficient of determination of the fit (higher is better; compared against
    # FitConfig.min_r_squared elsewhere — presumably in the fitting code)
    r_squared: float
    # Data points removed as outliers during fitting (presumably indices or
    # concurrency values — confirm against the fit implementation)
    outliers_removed: list[int]
class FitResults(BaseModel):
    """
    Linear fit results for both LLM latency and workflow runtime analysis.
    """
    # Fit for LLM latency; None when no fit was produced
    llm_latency_fit: LinearFitResult | None = None
    # Fit for workflow runtime; None when no fit was produced
    wf_runtime_fit: LinearFitResult | None = None
# GPU estimates are generated by the calculator.
class GPUEstimates(BaseModel):
    """
    GPU estimates.

    Both fields are optional so a missing estimate (None) is distinguishable
    from an estimate of zero.
    """
    # GPU estimate based on the workflow runtime
    gpu_estimate_by_wf_runtime: float | None = None
    # GPU estimate based on the LLM latency
    gpu_estimate_by_llm_latency: float | None = None
# Calc runner alerts are generated by the calculator.
class CalcAlerts(BaseModel):
    """
    Calc runner alerts.

    Flags and counts describing problems detected for a single run.
    """
    # If true, the run was identified as an outlier by the workflow runtime linear fit
    outlier_workflow_runtime: bool = False
    # If true, the run was identified as an outlier by the LLM latency linear fit
    outlier_llm_latency: bool = False
    # Number of items that are greater than the target latency
    num_items_greater_than_target_latency: int = 0
    # Number of items that are greater than the target runtime
    num_items_greater_than_target_runtime: int = 0
class CalcData(BaseModel):
    """
    Output of the calc runner per concurrency.
    """
    # ROUGH GPU estimates per concurrency: these are not used for the final GPU estimation,
    # they are only available for information purposes
    gpu_estimates: GPUEstimates = Field(default_factory=GPUEstimates)
    # Calc runner alerts
    alerts: CalcAlerts = Field(default_factory=CalcAlerts)
    # Sizing metrics
    sizing_metrics: SizingMetrics = Field(default_factory=SizingMetrics)
class CalcRunnerOutput(BaseModel):
    """
    Output of the calc runner.

    Top-level result: final GPU estimates, the fit results backing them,
    and the per-concurrency data they were derived from.
    """
    # GPU estimates based on the slope of the time vs concurrency, calculated online or offline
    gpu_estimates: GPUEstimates = Field(default_factory=GPUEstimates)
    # Linear fit results for analysis and debugging
    fit_results: FitResults = Field(default_factory=FitResults)
    # Per-concurrency data (GPU estimates, out-of-range runs, and sizing metrics),
    # keyed by concurrency. default_factory keeps the default consistent with the
    # other fields and avoids a shared mutable class attribute.
    calc_data: dict[int, CalcData] = Field(default_factory=dict)