From cdc0c92f4bf99fad79bd1986b45de42a28b57dd9 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 15:24:30 -0700 Subject: [PATCH 001/133] provision Azure Managed Grafana workspace --- azure-managed-grafana.bicep | 57 +++++++++ azure-pipelines-managed-grafana.yml | 49 ++++++++ eng/deploy-managed-grafana.yml | 30 +++++ eng/provision-grafana.yaml | 180 ++++++++++++++++++++++++++++ 4 files changed, 316 insertions(+) create mode 100644 azure-managed-grafana.bicep create mode 100644 azure-pipelines-managed-grafana.yml create mode 100644 eng/deploy-managed-grafana.yml create mode 100644 eng/provision-grafana.yaml diff --git a/azure-managed-grafana.bicep b/azure-managed-grafana.bicep new file mode 100644 index 000000000..c5b647324 --- /dev/null +++ b/azure-managed-grafana.bicep @@ -0,0 +1,57 @@ +// Azure Managed Grafana Workspace Bicep Template +@description('The Azure region where the Grafana workspace will be deployed') +param location string + +@description('The name of the Grafana workspace') +param grafanaWorkspaceName string + +@description('The pricing tier for the Grafana workspace') +@allowed([ + 'Standard' + 'Essential' +]) +param skuName string = 'Standard' + +@description('Object ID of the .NET Eng Services Azure AD group') +param dotnetEngServicesGroupObjectId string + +// Azure Managed Grafana Workspace +resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { + name: grafanaWorkspaceName + location: location + sku: { + name: skuName + } + identity: { + type: 'SystemAssigned' + } + properties: { + deterministicOutboundIP: 'Enabled' + apiKey: 'Enabled' + autoGeneratedDomainNameLabelScope: 'TenantReuse' + zoneRedundancy: 'Disabled' + publicNetworkAccess: 'Enabled' + grafanaIntegrations: { + azureMonitorWorkspaceIntegrations: [] + } + } +} + +// Role assignment to grant .NET Eng Services group Grafana Admin access +resource grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: 
guid(grafanaWorkspace.id, dotnetEngServicesGroupObjectId, 'Grafana Admin') + scope: grafanaWorkspace + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '22926164-76b3-42b3-bc55-97df8dab3e41') // Grafana Admin role + principalId: dotnetEngServicesGroupObjectId + principalType: 'Group' + } +} + +// Output the Grafana workspace details +output grafanaWorkspaceId string = grafanaWorkspace.id +output grafanaWorkspaceName string = grafanaWorkspace.name +output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint +output grafanaPrincipalId string = grafanaWorkspace.identity.principalId +output grafanaTenantId string = grafanaWorkspace.identity.tenantId +output grafanaWorkspaceLocation string = grafanaWorkspace.location diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml new file mode 100644 index 000000000..470180949 --- /dev/null +++ b/azure-pipelines-managed-grafana.yml @@ -0,0 +1,49 @@ +trigger: + batch: true + branches: + include: + - haruna/managed-grafana-new + - production +pr: none + +resources: + repositories: + - repository: 1ESPipelineTemplates + type: git + name: 1ESPipelineTemplates/1ESPipelineTemplates + ref: refs/tags/release +extends: + template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates + parameters: + pool: + name: NetCore1ESPool-Internal + image: 1es-windows-2019 + os: windows + sdl: + policheck: + enabled: true + tsa: + enabled: true + + stages: + - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: + - template: /eng/deploy-managed-grafana.yml@self + parameters: + ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: + DeploymentEnvironment: Staging + ServiceConnectionName: dnceng-managed-grafana-staging + GrafanaWorkspaceName: dnceng-grafana-staging + GrafanaKeyVault: dnceng-grafana-int-kv + GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg + 
ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 + ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ${{ else }}: + DeploymentEnvironment: Production + ServiceConnectionName: dnceng-managed-grafana + GrafanaWorkspaceName: dnceng-grafana + GrafanaKeyVault: dnceng-grafana-prod-kv + GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg + ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 + ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a + + diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml new file mode 100644 index 000000000..edc0b0e13 --- /dev/null +++ b/eng/deploy-managed-grafana.yml @@ -0,0 +1,30 @@ +parameters: +- name: ServiceConnectionName + type: string +- name: ServiceConnectionClientId + type: string +- name: ServiceConnectionId + type: string +- name: DeploymentEnvironment + type: string +- name: GrafanaWorkspaceName + type: string +- name: GrafanaKeyVault + type: string +- name: GrafanaVariableGroup + type: string + + +stages: +- stage: ProvisionGrafana + displayName: 'Provision Grafana Infrastructure' + jobs: + - template: /eng/provision-grafana.yaml@self + parameters: + DeploymentEnvironment: ${{ parameters.DeploymentEnvironment }} + ServiceConnectionName: ${{ parameters.ServiceConnectionName }} + GrafanaResourceGroup: 'monitoring-managed' + GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} + GrafanaLocation: 'westus2' + GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} + GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} \ No newline at end of file diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml new file mode 100644 index 000000000..4acc3d91a --- /dev/null +++ b/eng/provision-grafana.yaml @@ -0,0 +1,180 @@ +# Azure Managed Grafana Provisioning Template +# This template provisions Azure Managed Grafana workspaces as part of the deployment process + +parameters: +- name: DeploymentEnvironment + type: string + +- name: ServiceConnectionName + 
type: string + +- name: GrafanaResourceGroup + type: string + +- name: GrafanaWorkspaceName + type: string + +- name: GrafanaLocation + type: string + +- name: GrafanaKeyVault + type: string + +- name: GrafanaVariableGroup + type: string + +- name: SkipGrafanaProvisioning + type: boolean + default: false + +variables: + - group: ${{ parameters.GrafanaVariableGroup }} + +jobs: +- job: ProvisionGrafana + displayName: 'Provision Azure Managed Grafana' + condition: and(succeeded(), not('${{ parameters.SkipGrafanaProvisioning }}')) + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + + steps: + - checkout: self + displayName: 'Checkout Repository' + + - task: AzureCLI@2 + displayName: 'Install Bicep CLI' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pwsh' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Installing Bicep CLI..." + az bicep install + az bicep version + Write-Host "āœ… Bicep CLI installed successfully" + + - task: AzureCLI@2 + displayName: 'Validate Bicep Template' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pwsh' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Grafana Bicep template..." + if (!(Test-Path "azure-managed-grafana.bicep")) { + throw "Bicep template not found: azure-managed-grafana.bicep" + } + + az bicep build --file azure-managed-grafana.bicep + if ($LASTEXITCODE -ne 0) { + throw "Bicep template validation failed" + } + Write-Host "āœ… Bicep template validation successful" + + - task: AzureCLI@2 + displayName: 'Ensure Resource Group Exists' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pwsh' + scriptLocation: 'inlineScript' + inlineScript: | + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $location = "${{ parameters.GrafanaLocation }}" + + Write-Host "Checking if resource group '$rgName' exists..." 
+ $rg = az group show --name $rgName --query "name" --output tsv 2>$null + + if ($LASTEXITCODE -ne 0) { + Write-Host "Creating resource group '$rgName' in '$location'..." + az group create --name $rgName --location $location + if ($LASTEXITCODE -ne 0) { + throw "Failed to create resource group '$rgName'" + } + Write-Host "āœ… Resource group created successfully" + } else { + Write-Host "āœ… Resource group already exists" + } + + - task: AzureResourceManagerTemplateDeployment@3 + displayName: 'Deploy Grafana Workspace' + inputs: + deploymentScope: 'Resource Group' + azureResourceManagerConnection: '${{ parameters.ServiceConnectionName }}' + action: 'Create Or Update Resource Group' + resourceGroupName: '${{ parameters.GrafanaResourceGroup }}' + location: '${{ parameters.GrafanaLocation }}' + templateLocation: 'Linked artifact' + csmFile: 'azure-managed-grafana.bicep' + overrideParameters: | + -subscriptionId $(az account show --query id --output tsv) + -resourceGroupName "${{ parameters.GrafanaResourceGroup }}" + -location "${{ parameters.GrafanaLocation }}" + -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" + -skuName "Standard" + -dotnetEngServicesGroupObjectId "$(dotnet-eng-services-group-object-id)" + deploymentMode: 'Incremental' + deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' + deploymentOutputs: 'grafanaOutputs' + + - task: AzureCLI@2 + displayName: 'Verify Grafana Deployment' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pwsh' + scriptLocation: 'inlineScript' + inlineScript: | + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + + Write-Host "Verifying Grafana workspace deployment..." + + # Wait for deployment to complete + $maxAttempts = 30 + $attempt = 0 + do { + $attempt++ + Write-Host "Verification attempt $attempt of $maxAttempts..." 
+ + $workspace = az grafana show --name $workspaceName --resource-group $rgName 2>$null | ConvertFrom-Json + if ($workspace -and $workspace.properties.provisioningState -eq "Succeeded") { + break + } + + if ($attempt -lt $maxAttempts) { + Write-Host "Workspace not ready yet, waiting 30 seconds..." + Start-Sleep -Seconds 30 + } + } while ($attempt -lt $maxAttempts) + + if (!$workspace) { + throw "Failed to verify Grafana workspace deployment" + } + + Write-Host "šŸ“Š Grafana Workspace Details:" + Write-Host " Name: $($workspace.name)" + Write-Host " URL: $($workspace.properties.endpoint)" + Write-Host " Location: $($workspace.location)" + Write-Host " SKU: $($workspace.sku.name)" + Write-Host " Status: $($workspace.properties.provisioningState)" + Write-Host " Identity: $($workspace.identity.principalId)" + + # Verify role assignments + Write-Host "Checking role assignments..." + $roleAssignments = az role assignment list --scope $workspace.id --query "[].{principalId:principalId, roleDefinitionName:roleDefinitionName}" 2>$null | ConvertFrom-Json + if ($roleAssignments) { + $roleAssignments | ForEach-Object { + Write-Host " Role: $($_.roleDefinitionName) - Principal: $($_.principalId)" + } + } else { + Write-Host " No role assignments found" + } + + # Store outputs for downstream usage + Write-Host "##vso[task.setvariable variable=GrafanaUrl;isOutput=true]$($workspace.properties.endpoint)" + Write-Host "##vso[task.setvariable variable=GrafanaPrincipalId;isOutput=true]$($workspace.identity.principalId)" + Write-Host "##vso[task.setvariable variable=GrafanaResourceId;isOutput=true]$($workspace.id)" + + Write-Host "āœ… ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" + + \ No newline at end of file From fb7997db3aee7c3c3ce7a11578de0b0c2f3c5496 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 16:57:54 -0700 Subject: [PATCH 002/133] put variables in the right position --- eng/provision-grafana.yaml | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 4acc3d91a..536cb2169 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -27,9 +27,6 @@ parameters: type: boolean default: false -variables: - - group: ${{ parameters.GrafanaVariableGroup }} - jobs: - job: ProvisionGrafana displayName: 'Provision Azure Managed Grafana' @@ -38,6 +35,9 @@ jobs: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 + variables: + - group: ${{ parameters.GrafanaVariableGroup }} + steps: - checkout: self displayName: 'Checkout Repository' From e13fb60ed26b7ec00ba8230ec09c9b0c473e2ef2 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 17:02:25 -0700 Subject: [PATCH 003/133] remove conditional statement --- eng/provision-grafana.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 536cb2169..e0eac777c 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -30,7 +30,6 @@ parameters: jobs: - job: ProvisionGrafana displayName: 'Provision Azure Managed Grafana' - condition: and(succeeded(), not('${{ parameters.SkipGrafanaProvisioning }}')) pool: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 From 03b5d4f7ed4482ee58b6f5554faef490d67884dd Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 17:06:01 -0700 Subject: [PATCH 004/133] update windows to use 1es-windows-2022 --- azure-pipelines-managed-grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 470180949..51bd53271 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -17,7 +17,7 @@ extends: parameters: pool: name: NetCore1ESPool-Internal - image: 1es-windows-2019 + image: 1es-windows-2022 os: windows sdl: policheck: From 
4a5ed6d43f0d12136afea219b70cbf59df1ec298 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 17:42:44 -0700 Subject: [PATCH 005/133] remove bicep installation task --- eng/provision-grafana.yaml | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index e0eac777c..d793f1375 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -41,23 +41,11 @@ jobs: - checkout: self displayName: 'Checkout Repository' - - task: AzureCLI@2 - displayName: 'Install Bicep CLI' - inputs: - azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'pwsh' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Installing Bicep CLI..." - az bicep install - az bicep version - Write-Host "āœ… Bicep CLI installed successfully" - - task: AzureCLI@2 displayName: 'Validate Bicep Template' inputs: azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'pwsh' + scriptType: 'ps' scriptLocation: 'inlineScript' inlineScript: | Write-Host "Validating Grafana Bicep template..." 
@@ -75,7 +63,7 @@ jobs: displayName: 'Ensure Resource Group Exists' inputs: azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'pwsh' + scriptType: 'ps' scriptLocation: 'inlineScript' inlineScript: | $rgName = "${{ parameters.GrafanaResourceGroup }}" @@ -120,7 +108,7 @@ jobs: displayName: 'Verify Grafana Deployment' inputs: azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'pwsh' + scriptType: 'ps' scriptLocation: 'inlineScript' inlineScript: | $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" From ec7b9565b7a7fb3af2683706a3c3b6a4c0f475ae Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 20:56:53 -0700 Subject: [PATCH 006/133] remove parameters that are not needed --- eng/provision-grafana.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index d793f1375..22c4b029e 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -94,8 +94,6 @@ jobs: templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' overrideParameters: | - -subscriptionId $(az account show --query id --output tsv) - -resourceGroupName "${{ parameters.GrafanaResourceGroup }}" -location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" From 28e0edaa6f05aeaa69e4ce44e85570d73c28aa81 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 21:41:48 -0700 Subject: [PATCH 007/133] changed parameters file format for bicep --- eng/provision-grafana.yaml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 22c4b029e..b1b9eda01 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -94,10 +94,20 @@ jobs: templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' overrideParameters: | - -location "${{ parameters.GrafanaLocation }}" - 
-grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" - -skuName "Standard" - -dotnetEngServicesGroupObjectId "$(dotnet-eng-services-group-object-id)" + { + "location": { + "value": "${{ parameters.GrafanaLocation }}" + }, + "grafanaWorkspaceName": { + "value": "${{ parameters.GrafanaWorkspaceName }}" + }, + "skuName": { + "value": "Standard" + }, + "dotnetEngServicesGroupObjectId": { + "value": "$(dotnet-eng-services-group-object-id)" + } + } deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' From a5805d6036a102c69f0b10d19ce780011a8dedc0 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 22:20:47 -0700 Subject: [PATCH 008/133] changed parameters file format for bicep --- eng/provision-grafana.yaml | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index b1b9eda01..c80552a0c 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -93,21 +93,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' - overrideParameters: | - { - "location": { - "value": "${{ parameters.GrafanaLocation }}" - }, - "grafanaWorkspaceName": { - "value": "${{ parameters.GrafanaWorkspaceName }}" - }, - "skuName": { - "value": "Standard" - }, - "dotnetEngServicesGroupObjectId": { - "value": "$(dotnet-eng-services-group-object-id)" - } - } + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "$(dotnet-eng-services-group-object-id)"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' From 5424b0c4424f4facdd0ee2850944a0d9883c0996 Mon Sep 17 00:00:00 2001 From: 
Haruna Ogweda Date: Wed, 8 Oct 2025 23:48:41 -0700 Subject: [PATCH 009/133] remove role assignment from bicep --- eng/provision-grafana.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index c80552a0c..bcd8a16fb 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -93,7 +93,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "$(dotnet-eng-services-group-object-id)"' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' From e0004fcdf864d735f32b7a9140a1e1559b47bba3 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 8 Oct 2025 23:49:55 -0700 Subject: [PATCH 010/133] remove role assignment from bicep --- azure-managed-grafana.bicep | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/azure-managed-grafana.bicep b/azure-managed-grafana.bicep index c5b647324..21d97e5ec 100644 --- a/azure-managed-grafana.bicep +++ b/azure-managed-grafana.bicep @@ -12,9 +12,6 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' -@description('Object ID of the .NET Eng Services Azure AD group') -param dotnetEngServicesGroupObjectId string - // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -37,17 +34,6 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } -// Role assignment to grant .NET Eng Services group Grafana Admin access -resource 
grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(grafanaWorkspace.id, dotnetEngServicesGroupObjectId, 'Grafana Admin') - scope: grafanaWorkspace - properties: { - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '22926164-76b3-42b3-bc55-97df8dab3e41') // Grafana Admin role - principalId: dotnetEngServicesGroupObjectId - principalType: 'Group' - } -} - // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name From 0ef9267b3b55e08ed1c56a2624f024857db8604e Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 00:11:24 -0700 Subject: [PATCH 011/133] changed parameters file format for bicep --- eng/provision-grafana.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index bcd8a16fb..786726015 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -142,7 +142,7 @@ jobs: # Verify role assignments Write-Host "Checking role assignments..." 
- $roleAssignments = az role assignment list --scope $workspace.id --query "[].{principalId:principalId, roleDefinitionName:roleDefinitionName}" 2>$null | ConvertFrom-Json + $roleAssignments = az role assignment list --scope $workspace.id --query '[].{principalId:principalId, roleDefinitionName:roleDefinitionName}' 2>$null | ConvertFrom-Json if ($roleAssignments) { $roleAssignments | ForEach-Object { Write-Host " Role: $($_.roleDefinitionName) - Principal: $($_.principalId)" From 0382325c7793d6d6dcfccd31779fcad13170eb44 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 00:27:16 -0700 Subject: [PATCH 012/133] changed parameters file format for bicep --- eng/provision-grafana.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 786726015..30b38d369 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -57,7 +57,7 @@ jobs: if ($LASTEXITCODE -ne 0) { throw "Bicep template validation failed" } - Write-Host "āœ… Bicep template validation successful" + Write-Host "SUCCESS: Bicep template validation successful" - task: AzureCLI@2 displayName: 'Ensure Resource Group Exists' @@ -78,9 +78,9 @@ jobs: if ($LASTEXITCODE -ne 0) { throw "Failed to create resource group '$rgName'" } - Write-Host "āœ… Resource group created successfully" + Write-Host "SUCCESS: Resource group created successfully" } else { - Write-Host "āœ… Resource group already exists" + Write-Host "SUCCESS: Resource group already exists" } - task: AzureResourceManagerTemplateDeployment@3 @@ -132,7 +132,7 @@ jobs: throw "Failed to verify Grafana workspace deployment" } - Write-Host "šŸ“Š Grafana Workspace Details:" + Write-Host "GRAFANA WORKSPACE DETAILS:" Write-Host " Name: $($workspace.name)" Write-Host " URL: $($workspace.properties.endpoint)" Write-Host " Location: $($workspace.location)" @@ -156,6 +156,6 @@ jobs: Write-Host "##vso[task.setvariable 
variable=GrafanaPrincipalId;isOutput=true]$($workspace.identity.principalId)" Write-Host "##vso[task.setvariable variable=GrafanaResourceId;isOutput=true]$($workspace.id)" - Write-Host "āœ… ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" + Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" \ No newline at end of file From e8a9e30e71ea774d2a8b7f42bb65346867e9faad Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 10:22:23 -0700 Subject: [PATCH 013/133] add task to install amg extension --- eng/provision-grafana.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 30b38d369..6a2713a82 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -98,6 +98,21 @@ jobs: deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' + - task: AzureCLI@2 + displayName: 'Install Azure Managed Grafana Extension' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Installing Azure CLI Azure Managed Grafana extension..." 
+ az extension add --name amg --allow-preview-versions --yes + if ($LASTEXITCODE -ne 0) { + Write-Host "Warning: Failed to install amg extension, will use alternative verification method" + } else { + Write-Host "SUCCESS: Azure Managed Grafana extension installed" + } + - task: AzureCLI@2 displayName: 'Verify Grafana Deployment' inputs: From aa6018d058d405ba7fb73ccbc480907439ca7e83 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 10:38:31 -0700 Subject: [PATCH 014/133] remove allow-preview-versions flag --- eng/provision-grafana.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 6a2713a82..3bc4f575c 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -106,7 +106,7 @@ jobs: scriptLocation: 'inlineScript' inlineScript: | Write-Host "Installing Azure CLI Azure Managed Grafana extension..." - az extension add --name amg --allow-preview-versions --yes + az extension add --name amg if ($LASTEXITCODE -ne 0) { Write-Host "Warning: Failed to install amg extension, will use alternative verification method" } else { From ac429bd7ca6c74b03e60a7a0029de8d04372ca71 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 11:27:01 -0700 Subject: [PATCH 015/133] assign grafana admin role to .net eng services --- azure-managed-grafana.bicep | 17 +++++++++++++++++ azure-pipelines-managed-grafana.yml | 2 ++ eng/deploy-managed-grafana.yml | 5 ++++- eng/provision-grafana.yaml | 5 ++++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/azure-managed-grafana.bicep b/azure-managed-grafana.bicep index 21d97e5ec..537dcb51d 100644 --- a/azure-managed-grafana.bicep +++ b/azure-managed-grafana.bicep @@ -12,6 +12,12 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' +@description('Object ID of the .NET Eng Services Azure AD group') +param dotnetEngServicesGroupObjectId string = '' + +@description('Whether to create role 
assignment for .NET Eng Services group') +param createRoleAssignment bool = true + // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -34,6 +40,17 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } +// Role assignment to grant .NET Eng Services group Grafana Admin access +resource grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (createRoleAssignment && !empty(dotnetEngServicesGroupObjectId)) { + name: guid(grafanaWorkspace.id, dotnetEngServicesGroupObjectId, 'Grafana Admin') + scope: grafanaWorkspace + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '22926164-76b3-42b3-bc55-97df8dab3e41') // Grafana Admin role + principalId: dotnetEngServicesGroupObjectId + principalType: 'Group' + } +} + // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 51bd53271..d88774edf 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -37,6 +37,7 @@ extends: GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + DotnetEngServicesGroupObjectId: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana @@ -45,5 +46,6 @@ extends: GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a + DotnetEngServicesGroupObjectId: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 
edc0b0e13..fb4aa3053 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -13,6 +13,8 @@ parameters: type: string - name: GrafanaVariableGroup type: string +- name: DotnetEngServicesGroupObjectId + type: string stages: @@ -27,4 +29,5 @@ stages: GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} \ No newline at end of file + GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} + DotnetEngServicesGroupObjectId: ${{ parameters.DotnetEngServicesGroupObjectId }} \ No newline at end of file diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 3bc4f575c..ed356eac4 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -23,6 +23,9 @@ parameters: - name: GrafanaVariableGroup type: string +- name: DotnetEngServicesGroupObjectId + type: string + - name: SkipGrafanaProvisioning type: boolean default: false @@ -93,7 +96,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard"' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "${{ parameters.DotnetEngServicesGroupObjectId }}" -createRoleAssignment true' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' From 56839820dce762b70247726b40389d7d8ea911da Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 12:50:29 -0700 Subject: [PATCH 016/133] assign grafana admin role to .net eng services --- azure-pipelines-managed-grafana.yml | 2 +- 
eng/deploy-managed-grafana.yml | 1 + eng/provision-grafana.yaml | 43 ++++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index d88774edf..b0c045fd3 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -32,7 +32,7 @@ extends: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging - GrafanaWorkspaceName: dnceng-grafana-staging + GrafanaWorkspaceName: dnceng-grafana-staging-1 GrafanaKeyVault: dnceng-grafana-int-kv GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index fb4aa3053..7d93b679a 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -15,6 +15,7 @@ parameters: type: string - name: DotnetEngServicesGroupObjectId type: string + default: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" stages: diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index ed356eac4..82850c8b8 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -96,7 +96,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "${{ parameters.DotnetEngServicesGroupObjectId }}" -createRoleAssignment true' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "${{ parameters.DotnetEngServicesGroupObjectId }}" -createRoleAssignment false' deploymentMode: 
'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' @@ -176,4 +176,45 @@ jobs: Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" + - task: AzureCLI@2 + displayName: 'Assign Grafana Admin Role to .NET Eng Services' + continueOnError: true + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $groupObjectId = "${{ parameters.DotnetEngServicesGroupObjectId }}" + + Write-Host "Attempting to assign Grafana Admin role to .NET Eng Services group..." + + # Get the Grafana workspace resource ID + $subscriptionId = az account show --query id --output tsv + $resourceId = "/subscriptions/$subscriptionId/resourceGroups/$rgName/providers/Microsoft.Dashboard/grafana/$workspaceName" + + # Check if role assignment already exists + $existingAssignment = az role assignment list --scope $resourceId --assignee $groupObjectId --role "Grafana Admin" --query "[0].id" --output tsv 2>$null + + if ($existingAssignment) { + Write-Host "SUCCESS: .NET Eng Services group already has Grafana Admin role" + } else { + # Try to assign the role + Write-Host "Assigning Grafana Admin role to .NET Eng Services group ($groupObjectId)..." + az role assignment create --assignee $groupObjectId --role "Grafana Admin" --scope $resourceId 2>$null + + if ($LASTEXITCODE -eq 0) { + Write-Host "SUCCESS: Grafana Admin role assigned to .NET Eng Services group" + } else { + Write-Host "WARNING: Failed to assign Grafana Admin role automatically" + Write-Host "Manual assignment required:" + Write-Host " 1. Go to Azure Portal > Resource Groups > $rgName > $workspaceName" + Write-Host " 2. Access control (IAM) > Add role assignment" + Write-Host " 3. 
Role: Grafana Admin" + Write-Host " 4. Assign access to: Group" + Write-Host " 5. Select: .NET Eng Services group" + } + } + \ No newline at end of file From 340831d496885cc25e19c8975ccf563faf406c2f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 13:03:13 -0700 Subject: [PATCH 017/133] assign grafana admin role to .net eng services --- azure-pipelines-managed-grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index b0c045fd3..6302a1bf2 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -32,7 +32,7 @@ extends: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging - GrafanaWorkspaceName: dnceng-grafana-staging-1 + GrafanaWorkspaceName: dnceng-grafana-staging1 GrafanaKeyVault: dnceng-grafana-int-kv GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 From c76bcc49d8e6f5362602d26ef9f4af5ad091e70a Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 13:35:09 -0700 Subject: [PATCH 018/133] remove grafana admin role assignment --- azure-managed-grafana.bicep | 17 ---------- azure-pipelines-managed-grafana.yml | 2 -- eng/deploy-managed-grafana.yml | 7 +--- eng/provision-grafana.yaml | 50 ++--------------------------- 4 files changed, 3 insertions(+), 73 deletions(-) diff --git a/azure-managed-grafana.bicep b/azure-managed-grafana.bicep index 537dcb51d..21d97e5ec 100644 --- a/azure-managed-grafana.bicep +++ b/azure-managed-grafana.bicep @@ -12,12 +12,6 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' -@description('Object ID of the .NET Eng Services Azure AD group') -param dotnetEngServicesGroupObjectId string = '' - -@description('Whether to create role assignment for .NET Eng Services group') -param 
createRoleAssignment bool = true - // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -40,17 +34,6 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } -// Role assignment to grant .NET Eng Services group Grafana Admin access -resource grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (createRoleAssignment && !empty(dotnetEngServicesGroupObjectId)) { - name: guid(grafanaWorkspace.id, dotnetEngServicesGroupObjectId, 'Grafana Admin') - scope: grafanaWorkspace - properties: { - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '22926164-76b3-42b3-bc55-97df8dab3e41') // Grafana Admin role - principalId: dotnetEngServicesGroupObjectId - principalType: 'Group' - } -} - // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 6302a1bf2..b57c1cf5b 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -37,7 +37,6 @@ extends: GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a - DotnetEngServicesGroupObjectId: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana @@ -46,6 +45,5 @@ extends: GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a - DotnetEngServicesGroupObjectId: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 7d93b679a..777095736 100644 --- a/eng/deploy-managed-grafana.yml 
+++ b/eng/deploy-managed-grafana.yml @@ -13,10 +13,6 @@ parameters: type: string - name: GrafanaVariableGroup type: string -- name: DotnetEngServicesGroupObjectId - type: string - default: "65d7fc1d-2744-4669-8779-5cd7d7a6b95b" - stages: - stage: ProvisionGrafana @@ -30,5 +26,4 @@ stages: GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} - DotnetEngServicesGroupObjectId: ${{ parameters.DotnetEngServicesGroupObjectId }} \ No newline at end of file + GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} \ No newline at end of file diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 82850c8b8..548953b0a 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -23,9 +23,6 @@ parameters: - name: GrafanaVariableGroup type: string -- name: DotnetEngServicesGroupObjectId - type: string - - name: SkipGrafanaProvisioning type: boolean default: false @@ -96,7 +93,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard" -dotnetEngServicesGroupObjectId "${{ parameters.DotnetEngServicesGroupObjectId }}" -createRoleAssignment false' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' @@ -174,47 +171,4 @@ jobs: Write-Host "##vso[task.setvariable variable=GrafanaPrincipalId;isOutput=true]$($workspace.identity.principalId)" Write-Host "##vso[task.setvariable 
variable=GrafanaResourceId;isOutput=true]$($workspace.id)" - Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" - - - task: AzureCLI@2 - displayName: 'Assign Grafana Admin Role to .NET Eng Services' - continueOnError: true - inputs: - azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'ps' - scriptLocation: 'inlineScript' - inlineScript: | - $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" - $rgName = "${{ parameters.GrafanaResourceGroup }}" - $groupObjectId = "${{ parameters.DotnetEngServicesGroupObjectId }}" - - Write-Host "Attempting to assign Grafana Admin role to .NET Eng Services group..." - - # Get the Grafana workspace resource ID - $subscriptionId = az account show --query id --output tsv - $resourceId = "/subscriptions/$subscriptionId/resourceGroups/$rgName/providers/Microsoft.Dashboard/grafana/$workspaceName" - - # Check if role assignment already exists - $existingAssignment = az role assignment list --scope $resourceId --assignee $groupObjectId --role "Grafana Admin" --query "[0].id" --output tsv 2>$null - - if ($existingAssignment) { - Write-Host "SUCCESS: .NET Eng Services group already has Grafana Admin role" - } else { - # Try to assign the role - Write-Host "Assigning Grafana Admin role to .NET Eng Services group ($groupObjectId)..." - az role assignment create --assignee $groupObjectId --role "Grafana Admin" --scope $resourceId 2>$null - - if ($LASTEXITCODE -eq 0) { - Write-Host "SUCCESS: Grafana Admin role assigned to .NET Eng Services group" - } else { - Write-Host "WARNING: Failed to assign Grafana Admin role automatically" - Write-Host "Manual assignment required:" - Write-Host " 1. Go to Azure Portal > Resource Groups > $rgName > $workspaceName" - Write-Host " 2. Access control (IAM) > Add role assignment" - Write-Host " 3. Role: Grafana Admin" - Write-Host " 4. Assign access to: Group" - Write-Host " 5. 
Select: .NET Eng Services group" - } - } - - \ No newline at end of file + Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" \ No newline at end of file From eed1aed0aacf6ad48aca23612ec10f903a5fb6ba Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 15:23:33 -0700 Subject: [PATCH 019/133] add release job type --- azure-pipelines-managed-grafana.yml | 2 +- eng/provision-grafana.yaml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index b57c1cf5b..51bd53271 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -32,7 +32,7 @@ extends: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging - GrafanaWorkspaceName: dnceng-grafana-staging1 + GrafanaWorkspaceName: dnceng-grafana-staging GrafanaKeyVault: dnceng-grafana-int-kv GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 548953b0a..493529052 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -33,6 +33,9 @@ jobs: pool: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 + templateContext: + type: releaseJob + isProduction: false variables: - group: ${{ parameters.GrafanaVariableGroup }} From 90cdefc1b08abc3ede7d439f9eafbbcedda4f24c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 9 Oct 2025 15:32:29 -0700 Subject: [PATCH 020/133] remove release job type --- eng/provision-grafana.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 493529052..548953b0a 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -33,9 +33,6 @@ jobs: pool: 
name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 - templateContext: - type: releaseJob - isProduction: false variables: - group: ${{ parameters.GrafanaVariableGroup }} From f5337974e3d74143ece2a43891ad361f9733b4e1 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 13 Oct 2025 11:35:05 -0700 Subject: [PATCH 021/133] provision azure managed grafana workspace --- azure-pipelines.yml | 22 ++++++++++++++++++- .../deployment/azure-managed-grafana.bicep | 0 2 files changed, 21 insertions(+), 1 deletion(-) rename azure-managed-grafana.bicep => eng/deployment/azure-managed-grafana.bicep (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f14b3978b..1b70f57ab 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -252,4 +252,24 @@ extends: GrafanaKeyVault: dotnet-grafana GrafanaVariableGroup: Dotnet-Grafana-Production ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba - ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 \ No newline at end of file + ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 + + - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: + - template: /eng/deploy-managed-grafana.yml@self + parameters: + ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: + DeploymentEnvironment: Staging + ServiceConnectionName: dnceng-managed-grafana-staging + GrafanaWorkspaceName: dnceng-grafana-staging + GrafanaKeyVault: dnceng-grafana-int-kv + GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg + ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 + ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ${{ else }}: + DeploymentEnvironment: Production + ServiceConnectionName: dnceng-managed-grafana + GrafanaWorkspaceName: dnceng-grafana + GrafanaKeyVault: dnceng-grafana-prod-kv + GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg + ServiceConnectionClientId: 
0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 + ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a \ No newline at end of file diff --git a/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep similarity index 100% rename from azure-managed-grafana.bicep rename to eng/deployment/azure-managed-grafana.bicep From a074e783d77856e7ab722cc772145b26e51b5e24 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 13 Oct 2025 11:53:53 -0700 Subject: [PATCH 022/133] fix bicep file path --- azure-pipelines.yml | 1 - eng/provision-grafana.yaml | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1b70f57ab..a8348fd4d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -254,7 +254,6 @@ extends: ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - template: /eng/deploy-managed-grafana.yml@self parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 548953b0a..72faafe97 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -49,11 +49,11 @@ jobs: scriptLocation: 'inlineScript' inlineScript: | Write-Host "Validating Grafana Bicep template..." 
- if (!(Test-Path "azure-managed-grafana.bicep")) { + if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { throw "Bicep template not found: azure-managed-grafana.bicep" } - - az bicep build --file azure-managed-grafana.bicep + + az bicep build --file eng/deployment/azure-managed-grafana.bicep if ($LASTEXITCODE -ne 0) { throw "Bicep template validation failed" } @@ -92,7 +92,7 @@ jobs: resourceGroupName: '${{ parameters.GrafanaResourceGroup }}' location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' - csmFile: 'azure-managed-grafana.bicep' + csmFile: 'eng/deployment/azure-managed-grafana.bicep' overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' From 7fc2c347fe9f4c27d4911121ea620236dab956b9 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 13 Oct 2025 15:17:49 -0700 Subject: [PATCH 023/133] add provision grafana stage to the deployment --- eng/deploy-managed-grafana.yml | 3 +++ eng/deploy.yaml | 2 ++ 2 files changed, 5 insertions(+) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 777095736..15b7fd166 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -17,6 +17,9 @@ parameters: stages: - stage: ProvisionGrafana displayName: 'Provision Grafana Infrastructure' + dependsOn: + - predeploy + - approval jobs: - template: /eng/provision-grafana.yaml@self parameters: diff --git a/eng/deploy.yaml b/eng/deploy.yaml index efd8a04cf..3e18cd66a 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -156,6 +156,7 @@ stages: demands: ImageOverride -equals 1es-windows-2019 dependsOn: - deploy + - ProvisionGrafana variables: - group: ${{ parameters.StatusVariableGroup }} - group: ${{ parameters.GrafanaVariableGroup }} @@ -201,6 +202,7 @@ stages: demands:
ImageOverride -equals 1es-windows-2019 dependsOn: - deploy + - ProvisionGrafana jobs: - job: scenario displayName: Scenario tests From b5ea9cae1516cc3418ec5725094f5827c4aa419d Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 13 Oct 2025 17:15:19 -0700 Subject: [PATCH 024/133] add deploy azure managed grafana script --- eng/deployment/deploy-grafana.ps1 | 141 ++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 eng/deployment/deploy-grafana.ps1 diff --git a/eng/deployment/deploy-grafana.ps1 b/eng/deployment/deploy-grafana.ps1 new file mode 100644 index 000000000..f41b7efe8 --- /dev/null +++ b/eng/deployment/deploy-grafana.ps1 @@ -0,0 +1,141 @@ +# Azure Managed Grafana Deployment Script +# This script deploys an Azure Managed Grafana workspace using Bicep + +param( + [Parameter(Mandatory = $true)] + [string]$SubscriptionId, + + [Parameter(Mandatory = $true)] + [string]$ResourceGroupName, + + [Parameter(Mandatory = $true)] + [string]$Location, + + [Parameter(Mandatory = $true)] + [string]$GrafanaWorkspaceName, + + [Parameter(Mandatory = $false)] + [string]$DeploymentName = "grafana-deployment-$(Get-Date -Format 'yyyyMMdd-HHmmss')", + + [Parameter(Mandatory = $false)] + [switch]$WhatIf = $false +) + +# Set error action preference +$ErrorActionPreference = "Stop" + +Write-Host "=======================================" -ForegroundColor Cyan +Write-Host "Azure Managed Grafana Deployment Script" -ForegroundColor Cyan +Write-Host "=======================================" -ForegroundColor Cyan + +try { + # Check if Azure CLI is installed + Write-Host "Checking Azure CLI installation..." -ForegroundColor Yellow + az version 2>$null | Out-Null + if ($LASTEXITCODE -ne 0) { + throw "Azure CLI is not installed or not in PATH. Please install Azure CLI first." + } + Write-Host "āœ“ Azure CLI is installed" -ForegroundColor Green + + # Check if user is logged in + Write-Host "Checking Azure authentication..." 
-ForegroundColor Yellow + $account = az account show 2>$null | ConvertFrom-Json + if ($LASTEXITCODE -ne 0) { + Write-Host "Not logged in to Azure. Please login..." -ForegroundColor Yellow + az login + if ($LASTEXITCODE -ne 0) { + throw "Failed to login to Azure" + } + } + Write-Host "āœ“ Authenticated as: $($account.user.name)" -ForegroundColor Green + + # Set the subscription + Write-Host "Setting subscription to: $SubscriptionId" -ForegroundColor Yellow + az account set --subscription $SubscriptionId + if ($LASTEXITCODE -ne 0) { + throw "Failed to set subscription. Please check if the subscription ID is correct and you have access." + } + Write-Host "āœ“ Subscription set successfully" -ForegroundColor Green + + # Check if resource group exists, create if it doesn't + Write-Host "Checking if resource group '$ResourceGroupName' exists..." -ForegroundColor Yellow + az group show --name $ResourceGroupName 2>$null | Out-Null + if ($LASTEXITCODE -ne 0) { + Write-Host "Resource group doesn't exist. Creating..." 
-ForegroundColor Yellow + az group create --name $ResourceGroupName --location $Location + if ($LASTEXITCODE -ne 0) { + throw "Failed to create resource group" + } + Write-Host "āœ“ Resource group created successfully" -ForegroundColor Green + } else { + Write-Host "āœ“ Resource group already exists" -ForegroundColor Green + } + + # Get the Bicep file path + $bicepFile = Join-Path $PSScriptRoot "azure-managed-grafana.bicep" + if (!(Test-Path $bicepFile)) { + throw "Bicep file not found at: $bicepFile" + } + Write-Host "āœ“ Bicep file found: $bicepFile" -ForegroundColor Green + + # Prepare deployment parameters + $parameters = @{ + location = $Location + grafanaWorkspaceName = $GrafanaWorkspaceName + skuName = "Standard" + } + + # Convert parameters to string format for Azure CLI + $paramString = ($parameters.GetEnumerator() | ForEach-Object { "$($_.Key)=`"$($_.Value)`"" }) -join " " + + # Run deployment + if ($WhatIf) { + Write-Host "Running what-if deployment..." -ForegroundColor Yellow + $cmd = "az deployment group what-if --resource-group $ResourceGroupName --template-file `"$bicepFile`" --parameters $paramString" + Write-Host "Command: $cmd" -ForegroundColor Gray + Invoke-Expression $cmd + } else { + Write-Host "Starting deployment..." -ForegroundColor Yellow + Write-Host "Deployment name: $DeploymentName" -ForegroundColor Gray + Write-Host "Resource group: $ResourceGroupName" -ForegroundColor Gray + Write-Host "Grafana workspace name: $GrafanaWorkspaceName" -ForegroundColor Gray + + $cmd = "az deployment group create --resource-group $ResourceGroupName --name $DeploymentName --template-file `"$bicepFile`" --parameters $paramString" + Write-Host "Command: $cmd" -ForegroundColor Gray + + $result = Invoke-Expression $cmd | ConvertFrom-Json + + if ($LASTEXITCODE -eq 0) { + Write-Host "=======================================" -ForegroundColor Green + Write-Host "āœ“ Deployment completed successfully!" 
-ForegroundColor Green + Write-Host "=======================================" -ForegroundColor Green + + # Display outputs + if ($result.properties.outputs) { + Write-Host "Deployment Outputs:" -ForegroundColor Cyan + $result.properties.outputs | ConvertTo-Json -Depth 3 | Write-Host + } + + # Get the Grafana workspace details + Write-Host "`nGrafana Workspace Details:" -ForegroundColor Cyan + $grafana = az grafana show --name $GrafanaWorkspaceName --resource-group $ResourceGroupName | ConvertFrom-Json + Write-Host "Workspace Name: $($grafana.name)" -ForegroundColor White + Write-Host "Workspace URL: $($grafana.properties.endpoint)" -ForegroundColor White + Write-Host "Location: $($grafana.location)" -ForegroundColor White + Write-Host "SKU: $($grafana.sku.name)" -ForegroundColor White + Write-Host "System Managed Identity: $($grafana.identity.principalId)" -ForegroundColor White + } else { + throw "Deployment failed" + } + } +} +catch { + Write-Host "=======================================" -ForegroundColor Red + Write-Host "āŒ Error occurred during deployment:" -ForegroundColor Red + Write-Host $_.Exception.Message -ForegroundColor Red + Write-Host "=======================================" -ForegroundColor Red + exit 1 +} + +Write-Host "`nšŸŽ‰ Script completed successfully!" -ForegroundColor Green +Write-Host "You can now access your Grafana workspace and configure it as needed." 
-ForegroundColor Yellow From 6ff2dba12422300c3030bae32c569776edd600ec Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 14 Oct 2025 11:26:28 -0700 Subject: [PATCH 025/133] remove test pipeline --- azure-pipelines-managed-grafana.yml | 49 ----------------------------- 1 file changed, 49 deletions(-) delete mode 100644 azure-pipelines-managed-grafana.yml diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml deleted file mode 100644 index 51bd53271..000000000 --- a/azure-pipelines-managed-grafana.yml +++ /dev/null @@ -1,49 +0,0 @@ -trigger: - batch: true - branches: - include: - - haruna/managed-grafana-new - - production -pr: none - -resources: - repositories: - - repository: 1ESPipelineTemplates - type: git - name: 1ESPipelineTemplates/1ESPipelineTemplates - ref: refs/tags/release -extends: - template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates - parameters: - pool: - name: NetCore1ESPool-Internal - image: 1es-windows-2022 - os: windows - sdl: - policheck: - enabled: true - tsa: - enabled: true - - stages: - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - - template: /eng/deploy-managed-grafana.yml@self - parameters: - ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: - DeploymentEnvironment: Staging - ServiceConnectionName: dnceng-managed-grafana-staging - GrafanaWorkspaceName: dnceng-grafana-staging - GrafanaKeyVault: dnceng-grafana-int-kv - GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg - ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 - ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a - ${{ else }}: - DeploymentEnvironment: Production - ServiceConnectionName: dnceng-managed-grafana - GrafanaWorkspaceName: dnceng-grafana - GrafanaKeyVault: dnceng-grafana-prod-kv - GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg - ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 - 
ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a - - From 8106d3b198b7074ef783d033011fef520e8bd672 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 11:49:29 -0700 Subject: [PATCH 026/133] delete deploy-grafana.ps1 file --- eng/deployment/deploy-grafana.ps1 | 141 ------------------------------ 1 file changed, 141 deletions(-) delete mode 100644 eng/deployment/deploy-grafana.ps1 diff --git a/eng/deployment/deploy-grafana.ps1 b/eng/deployment/deploy-grafana.ps1 deleted file mode 100644 index f41b7efe8..000000000 --- a/eng/deployment/deploy-grafana.ps1 +++ /dev/null @@ -1,141 +0,0 @@ -# Azure Managed Grafana Deployment Script -# This script deploys an Azure Managed Grafana workspace using Bicep - -param( - [Parameter(Mandatory = $true)] - [string]$SubscriptionId, - - [Parameter(Mandatory = $true)] - [string]$ResourceGroupName, - - [Parameter(Mandatory = $true)] - [string]$Location, - - [Parameter(Mandatory = $true)] - [string]$GrafanaWorkspaceName, - - [Parameter(Mandatory = $false)] - [string]$DeploymentName = "grafana-deployment-$(Get-Date -Format 'yyyyMMdd-HHmmss')", - - [Parameter(Mandatory = $false)] - [switch]$WhatIf = $false -) - -# Set error action preference -$ErrorActionPreference = "Stop" - -Write-Host "=======================================" -ForegroundColor Cyan -Write-Host "Azure Managed Grafana Deployment Script" -ForegroundColor Cyan -Write-Host "=======================================" -ForegroundColor Cyan - -try { - # Check if Azure CLI is installed - Write-Host "Checking Azure CLI installation..." -ForegroundColor Yellow - az version 2>$null | Out-Null - if ($LASTEXITCODE -ne 0) { - throw "Azure CLI is not installed or not in PATH. Please install Azure CLI first." - } - Write-Host "āœ“ Azure CLI is installed" -ForegroundColor Green - - # Check if user is logged in - Write-Host "Checking Azure authentication..." 
-ForegroundColor Yellow - $account = az account show 2>$null | ConvertFrom-Json - if ($LASTEXITCODE -ne 0) { - Write-Host "Not logged in to Azure. Please login..." -ForegroundColor Yellow - az login - if ($LASTEXITCODE -ne 0) { - throw "Failed to login to Azure" - } - } - Write-Host "āœ“ Authenticated as: $($account.user.name)" -ForegroundColor Green - - # Set the subscription - Write-Host "Setting subscription to: $SubscriptionId" -ForegroundColor Yellow - az account set --subscription $SubscriptionId - if ($LASTEXITCODE -ne 0) { - throw "Failed to set subscription. Please check if the subscription ID is correct and you have access." - } - Write-Host "āœ“ Subscription set successfully" -ForegroundColor Green - - # Check if resource group exists, create if it doesn't - Write-Host "Checking if resource group '$ResourceGroupName' exists..." -ForegroundColor Yellow - az group show --name $ResourceGroupName 2>$null | Out-Null - if ($LASTEXITCODE -ne 0) { - Write-Host "Resource group doesn't exist. Creating..." 
-ForegroundColor Yellow - az group create --name $ResourceGroupName --location $Location - if ($LASTEXITCODE -ne 0) { - throw "Failed to create resource group" - } - Write-Host "āœ“ Resource group created successfully" -ForegroundColor Green - } else { - Write-Host "āœ“ Resource group already exists" -ForegroundColor Green - } - - # Get the Bicep file path - $bicepFile = Join-Path $PSScriptRoot "azure-managed-grafana.bicep" - if (!(Test-Path $bicepFile)) { - throw "Bicep file not found at: $bicepFile" - } - Write-Host "āœ“ Bicep file found: $bicepFile" -ForegroundColor Green - - # Prepare deployment parameters - $parameters = @{ - location = $Location - grafanaWorkspaceName = $GrafanaWorkspaceName - skuName = "Standard" - } - - # Convert parameters to string format for Azure CLI - $paramString = ($parameters.GetEnumerator() | ForEach-Object { "$($_.Key)=`"$($_.Value)`"" }) -join " " - - # Run deployment - if ($WhatIf) { - Write-Host "Running what-if deployment..." -ForegroundColor Yellow - $cmd = "az deployment group what-if --resource-group $ResourceGroupName --template-file `"$bicepFile`" --parameters $paramString" - Write-Host "Command: $cmd" -ForegroundColor Gray - Invoke-Expression $cmd - } else { - Write-Host "Starting deployment..." -ForegroundColor Yellow - Write-Host "Deployment name: $DeploymentName" -ForegroundColor Gray - Write-Host "Resource group: $ResourceGroupName" -ForegroundColor Gray - Write-Host "Grafana workspace name: $GrafanaWorkspaceName" -ForegroundColor Gray - - $cmd = "az deployment group create --resource-group $ResourceGroupName --name $DeploymentName --template-file `"$bicepFile`" --parameters $paramString" - Write-Host "Command: $cmd" -ForegroundColor Gray - - $result = Invoke-Expression $cmd | ConvertFrom-Json - - if ($LASTEXITCODE -eq 0) { - Write-Host "=======================================" -ForegroundColor Green - Write-Host "āœ“ Deployment completed successfully!" 
-ForegroundColor Green - Write-Host "=======================================" -ForegroundColor Green - - # Display outputs - if ($result.properties.outputs) { - Write-Host "Deployment Outputs:" -ForegroundColor Cyan - $result.properties.outputs | ConvertTo-Json -Depth 3 | Write-Host - } - - # Get the Grafana workspace details - Write-Host "`nGrafana Workspace Details:" -ForegroundColor Cyan - $grafana = az grafana show --name $GrafanaWorkspaceName --resource-group $ResourceGroupName | ConvertFrom-Json - Write-Host "Workspace Name: $($grafana.name)" -ForegroundColor White - Write-Host "Workspace URL: $($grafana.properties.endpoint)" -ForegroundColor White - Write-Host "Location: $($grafana.location)" -ForegroundColor White - Write-Host "SKU: $($grafana.sku.name)" -ForegroundColor White - Write-Host "System Managed Identity: $($grafana.identity.principalId)" -ForegroundColor White - } else { - throw "Deployment failed" - } - } -} -catch { - Write-Host "=======================================" -ForegroundColor Red - Write-Host "āŒ Error occurred during deployment:" -ForegroundColor Red - Write-Host $_.Exception.Message -ForegroundColor Red - Write-Host "=======================================" -ForegroundColor Red - exit 1 -} - -Write-Host "`nšŸŽ‰ Script completed successfully!" -ForegroundColor Green -Write-Host "You can now access your Grafana workspace and configure it as needed." 
-ForegroundColor Yellow From 7e7134730e852572550f379c350e38d640d4a923 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 11:50:57 -0700 Subject: [PATCH 027/133] add grafana bicep validation task to the pr jobs --- azure-pipelines-pr.yml | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/azure-pipelines-pr.yml b/azure-pipelines-pr.yml index aba20628e..29e8692a9 100644 --- a/azure-pipelines-pr.yml +++ b/azure-pipelines-pr.yml @@ -6,6 +6,12 @@ pr: - main - production +variables: + ${{ if eq(variables['System.PullRequest.TargetBranch'], 'refs/heads/production') }}: + ServiceConnectionName: 'dnceng-managed-grafana' + ${{ else }}: + ServiceConnectionName: 'dnceng-managed-grafana-staging' + stages: - stage: build dependsOn: [] @@ -105,4 +111,22 @@ stages: dotnet run --project src/SecretManager/Microsoft.DncEng.SecretManager -- validate-all -b src @manifestArgs displayName: Verify Secret Usages - - template: /eng/test.yaml \ No newline at end of file + - template: /eng/test.yaml + + - task: AzureCLI@2 + displayName: 'Validate Grafana Bicep Template' + inputs: + azureSubscription: '$(ServiceConnectionName)' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Grafana Bicep template..." 
+ if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { + throw "Bicep template not found: azure-managed-grafana.bicep" + } + + az bicep build --file eng/deployment/azure-managed-grafana.bicep + if ($LASTEXITCODE -ne 0) { + throw "Bicep template validation failed" + } + Write-Host "SUCCESS: Bicep template validation successful" \ No newline at end of file From 2c39d250c1673e8baaf097ff9fc721fa03379c93 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 12:07:26 -0700 Subject: [PATCH 028/133] add grafana bicep validation to the build stage --- azure-pipelines.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a8348fd4d..ae87e42e7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -206,6 +206,24 @@ extends: contents: '*' targetFolder: $(Build.ArtifactStagingDirectory)\eng + - task: AzureCLI@2 + displayName: 'Validate Grafana Bicep Template' + inputs: + azureSubscription: '$(ServiceConnectionName)' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Grafana Bicep template..." 
+ if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { + throw "Bicep template not found: azure-managed-grafana.bicep" + } + + az bicep build --file eng/deployment/azure-managed-grafana.bicep + if ($LASTEXITCODE -ne 0) { + throw "Bicep template validation failed" + } + Write-Host "SUCCESS: Bicep template validation successful" + - template: /eng/common/templates-official/post-build/post-build.yml@self parameters: enableSymbolValidation: false From 4d2041c9b79cc978bc25fb7f96c0cd319fead178 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 12:12:16 -0700 Subject: [PATCH 029/133] remove unused GrafanaKeyVault parameter --- azure-pipelines.yml | 2 -- eng/deploy-managed-grafana.yml | 3 --- eng/provision-grafana.yaml | 3 --- 3 files changed, 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ae87e42e7..6a0c6a2ce 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -278,7 +278,6 @@ extends: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging GrafanaWorkspaceName: dnceng-grafana-staging - GrafanaKeyVault: dnceng-grafana-int-kv GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a @@ -286,7 +285,6 @@ extends: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana GrafanaWorkspaceName: dnceng-grafana - GrafanaKeyVault: dnceng-grafana-prod-kv GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a \ No newline at end of file diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 15b7fd166..5bb250cdb 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -9,8 +9,6 @@ parameters: type: string - name: GrafanaWorkspaceName type: string -- name: GrafanaKeyVault - type: 
string - name: GrafanaVariableGroup type: string @@ -28,5 +26,4 @@ stages: GrafanaResourceGroup: 'monitoring-managed' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' - GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} \ No newline at end of file diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 72faafe97..411efb8b8 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -17,9 +17,6 @@ parameters: - name: GrafanaLocation type: string -- name: GrafanaKeyVault - type: string - - name: GrafanaVariableGroup type: string From 758a85cc8c71093c681f6e9566eba1d6fc72936e Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 12:18:34 -0700 Subject: [PATCH 030/133] remove unused GrafanaVariableGroup parameter --- azure-pipelines.yml | 2 -- eng/deploy-managed-grafana.yml | 3 --- eng/provision-grafana.yaml | 6 ------ 3 files changed, 11 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6a0c6a2ce..42a63b3bc 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -278,13 +278,11 @@ extends: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging GrafanaWorkspaceName: dnceng-grafana-staging - GrafanaVariableGroup: Dnceng-Managed-Grafana-Staging-Vg ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana GrafanaWorkspaceName: dnceng-grafana - GrafanaVariableGroup: Dnceng-Managed-Grafana-Vg ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a \ No newline at end of file diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 5bb250cdb..a61f9bef6 100644 --- a/eng/deploy-managed-grafana.yml +++ 
b/eng/deploy-managed-grafana.yml @@ -9,8 +9,6 @@ parameters: type: string - name: GrafanaWorkspaceName type: string -- name: GrafanaVariableGroup - type: string stages: - stage: ProvisionGrafana @@ -26,4 +24,3 @@ stages: GrafanaResourceGroup: 'monitoring-managed' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' - GrafanaVariableGroup: ${{ parameters.GrafanaVariableGroup }} \ No newline at end of file diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 411efb8b8..a8e9b53d7 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -17,9 +17,6 @@ parameters: - name: GrafanaLocation type: string -- name: GrafanaVariableGroup - type: string - - name: SkipGrafanaProvisioning type: boolean default: false @@ -31,9 +28,6 @@ jobs: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 - variables: - - group: ${{ parameters.GrafanaVariableGroup }} - steps: - checkout: self displayName: 'Checkout Repository' From 71c3d730b4e3744004c25325fdc70ab07f1e22b2 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 12:52:57 -0700 Subject: [PATCH 031/133] add skuName variable --- eng/provision-grafana.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index a8e9b53d7..d26e4672c 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -17,6 +17,10 @@ parameters: - name: GrafanaLocation type: string +- name: GrafanaSkuName + type: string + default: 'Standard' + - name: SkipGrafanaProvisioning type: boolean default: false @@ -84,7 +88,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'eng/deployment/azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "Standard"' + overrideParameters: '-location "${{ 
parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' From cf44564ce3808f91a2344f469027296016eaa6af Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 14:56:20 -0700 Subject: [PATCH 032/133] remove unused SkipGrafanaProvisioning parameter --- eng/provision-grafana.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index d26e4672c..e46735f4b 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -21,10 +21,6 @@ parameters: type: string default: 'Standard' -- name: SkipGrafanaProvisioning - type: boolean - default: false - jobs: - job: ProvisionGrafana displayName: 'Provision Azure Managed Grafana' From 0e9664dfb2a50c363488ef63da593fb168c996dc Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 15:00:25 -0700 Subject: [PATCH 033/133] reduce maximum attempts to 5 for grafana deployment --- eng/provision-grafana.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index e46735f4b..590868319 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -117,7 +117,7 @@ jobs: Write-Host "Verifying Grafana workspace deployment..." 
# Wait for deployment to complete - $maxAttempts = 30 + $maxAttempts = 5 $attempt = 0 do { $attempt++ From 4d387fe48e3c00cb85eb7365761e4257409366e4 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 15:17:47 -0700 Subject: [PATCH 034/133] remove output variables since there is no downstream usage --- eng/provision-grafana.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 590868319..bbda67023 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -157,9 +157,4 @@ jobs: Write-Host " No role assignments found" } - # Store outputs for downstream usage - Write-Host "##vso[task.setvariable variable=GrafanaUrl;isOutput=true]$($workspace.properties.endpoint)" - Write-Host "##vso[task.setvariable variable=GrafanaPrincipalId;isOutput=true]$($workspace.identity.principalId)" - Write-Host "##vso[task.setvariable variable=GrafanaResourceId;isOutput=true]$($workspace.id)" - Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" \ No newline at end of file From ed34f93dec7c06388d2f9ccfd716cd57c927fbce Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 16:07:52 -0700 Subject: [PATCH 035/133] make the dotnet eng services group the grafana admin --- eng/deployment/azure-managed-grafana.bicep | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 21d97e5ec..1bf626a9a 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -12,6 +12,12 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' +@description('The Azure AD Object ID of the .NET Eng Services group') +param dotnetEngServicesGroupId string = '65d7fc1d-2744-4669-8779-5cd7d7a6b95b' + +// Define the Grafana Admin role definition ID +var grafanaAdminRoleId = 
'22926164-76b3-42b3-bc55-97df8dab3e41' + // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -34,6 +40,17 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } +// Role assignment to grant Grafana Admin access to .NET Engineering Services group +resource grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) + scope: grafanaWorkspace + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', grafanaAdminRoleId) + principalId: dotnetEngServicesGroupId + principalType: 'Group' + } +} + // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name @@ -41,3 +58,4 @@ output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint output grafanaPrincipalId string = grafanaWorkspace.identity.principalId output grafanaTenantId string = grafanaWorkspace.identity.tenantId output grafanaWorkspaceLocation string = grafanaWorkspace.location +output dotnetEngServicesRoleAssignmentId string = grafanaAdminRoleAssignment.id From 85b72271c35634b10f26cefb95e9a3cbc0e2f7f4 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 16:17:21 -0700 Subject: [PATCH 036/133] test grafana --- azure-pipelines-managed-grafana.yml | 42 +++++++++++++++++++++++++++++ eng/deploy-managed-grafana.yml | 6 ++--- 2 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 azure-pipelines-managed-grafana.yml diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml new file mode 100644 index 000000000..8bb47792d --- /dev/null +++ b/azure-pipelines-managed-grafana.yml @@ -0,0 +1,42 @@ +trigger: + batch: true + branches: + include: + - haruna/managed-grafana-new + - production +pr: none + +resources: + 
repositories: + - repository: 1ESPipelineTemplates + type: git + name: 1ESPipelineTemplates/1ESPipelineTemplates + ref: refs/tags/release +extends: + template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates + parameters: + pool: + name: NetCore1ESPool-Internal + image: 1es-windows-2022 + os: windows + sdl: + policheck: + enabled: true + tsa: + enabled: true + + stages: + - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: + - template: /eng/deploy-managed-grafana.yml@self + parameters: + ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: + DeploymentEnvironment: Staging + ServiceConnectionName: dnceng-managed-grafana-1 + GrafanaWorkspaceName: dnceng-grafana-staging + ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ${{ else }}: + DeploymentEnvironment: Production + ServiceConnectionName: dnceng-managed-grafana + GrafanaWorkspaceName: dnceng-grafana + ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a + diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index a61f9bef6..44c64f899 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -13,9 +13,9 @@ parameters: stages: - stage: ProvisionGrafana displayName: 'Provision Grafana Infrastructure' - dependsOn: - - predeploy - - approval + # dependsOn: + # - predeploy + # - approval jobs: - template: /eng/provision-grafana.yaml@self parameters: From 669da0d19ce00fcfd2d83489f58be674f5e912cc Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 16:24:08 -0700 Subject: [PATCH 037/133] test grafana --- eng/deploy-managed-grafana.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 44c64f899..131cb0229 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -1,8 +1,6 @@ parameters: - name: ServiceConnectionName type: string -- name: 
ServiceConnectionClientId - type: string - name: ServiceConnectionId type: string - name: DeploymentEnvironment From 9b2a2e5e3a8ce6490ab364be08edd73a4996bba7 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 16:25:24 -0700 Subject: [PATCH 038/133] test grafana --- azure-pipelines-managed-grafana.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 8bb47792d..e85e4da17 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -31,8 +31,8 @@ extends: parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: dnceng-managed-grafana-1 - GrafanaWorkspaceName: dnceng-grafana-staging + ServiceConnectionName: dnceng-managed-grafana-staging + GrafanaWorkspaceName: dnceng-grafana-1 ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a ${{ else }}: DeploymentEnvironment: Production From bfa288258432dee9eba8e74b9241e1fdea863c00 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 22:36:36 -0700 Subject: [PATCH 039/133] remove group grafana admin assignment --- eng/deployment/azure-managed-grafana.bicep | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 1bf626a9a..21d97e5ec 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -12,12 +12,6 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' -@description('The Azure AD Object ID of the .NET Eng Services group') -param dotnetEngServicesGroupId string = '65d7fc1d-2744-4669-8779-5cd7d7a6b95b' - -// Define the Grafana Admin role definition ID -var grafanaAdminRoleId = '22926164-76b3-42b3-bc55-97df8dab3e41' - // Azure Managed Grafana Workspace resource grafanaWorkspace 
'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -40,17 +34,6 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } -// Role assignment to grant Grafana Admin access to .NET Engineering Services group -resource grafanaAdminRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) - scope: grafanaWorkspace - properties: { - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', grafanaAdminRoleId) - principalId: dotnetEngServicesGroupId - principalType: 'Group' - } -} - // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name @@ -58,4 +41,3 @@ output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint output grafanaPrincipalId string = grafanaWorkspace.identity.principalId output grafanaTenantId string = grafanaWorkspace.identity.tenantId output grafanaWorkspaceLocation string = grafanaWorkspace.location -output dotnetEngServicesRoleAssignmentId string = grafanaAdminRoleAssignment.id From bc29c7b16d45f69294e96b1d96036574c9abde84 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 23:19:13 -0700 Subject: [PATCH 040/133] add user assigned managed identity --- eng/deployment/azure-managed-grafana.bicep | 25 +++++++++++++++++++--- eng/provision-grafana.yaml | 22 +++++++++++++++++-- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 21d97e5ec..a42479b16 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -12,6 +12,20 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' +@description('The deployment environment (Staging or Production)') +param environment string + +// User-assigned managed identity 
for Grafana +resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { + name: environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' + location: location + tags: { + Environment: environment + Purpose: 'Azure Managed Grafana' + Service: 'DncEng' + } +} + // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -20,7 +34,10 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: skuName } identity: { - type: 'SystemAssigned' + type: 'UserAssigned' + userAssignedIdentities: { + '${grafanaUserAssignedIdentity.id}': {} + } } properties: { deterministicOutboundIP: 'Enabled' @@ -38,6 +55,8 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint -output grafanaPrincipalId string = grafanaWorkspace.identity.principalId -output grafanaTenantId string = grafanaWorkspace.identity.tenantId +output grafanaPrincipalId string = grafanaUserAssignedIdentity.properties.principalId +output grafanaTenantId string = grafanaUserAssignedIdentity.properties.tenantId output grafanaWorkspaceLocation string = grafanaWorkspace.location +output grafanaUserAssignedIdentityId string = grafanaUserAssignedIdentity.id +output grafanaUserAssignedIdentityName string = grafanaUserAssignedIdentity.name diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index bbda67023..5a5c92321 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -84,7 +84,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 'eng/deployment/azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ 
parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}"' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}" -environment "${{ parameters.DeploymentEnvironment }}"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' @@ -144,7 +144,25 @@ jobs: Write-Host " Location: $($workspace.location)" Write-Host " SKU: $($workspace.sku.name)" Write-Host " Status: $($workspace.properties.provisioningState)" - Write-Host " Identity: $($workspace.identity.principalId)" + Write-Host " Identity Type: $($workspace.identity.type)" + + # Display user-assigned identity details + if ($workspace.identity.type -eq "UserAssigned") { + $userIdentities = $workspace.identity.userAssignedIdentities + if ($userIdentities) { + Write-Host " User-Assigned Identities:" + $userIdentities.PSObject.Properties | ForEach-Object { + $identityId = $_.Name + $identityName = $identityId.Split('/')[-1] + Write-Host " Name: $identityName" + Write-Host " Resource ID: $identityId" + Write-Host " Principal ID: $($_.Value.principalId)" + Write-Host " Client ID: $($_.Value.clientId)" + } + } + } else { + Write-Host " Principal ID: $($workspace.identity.principalId)" + } # Verify role assignments Write-Host "Checking role assignments..." 
From 0351d84fa4162067457f304c92de881deda24c1f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 15 Oct 2025 23:57:03 -0700 Subject: [PATCH 041/133] add user assigned managed identity --- eng/deployment/azure-managed-grafana.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index a42479b16..6fc3bc6f2 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -17,7 +17,7 @@ param environment string // User-assigned managed identity for Grafana resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { - name: environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' + name: environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-1' location: location tags: { Environment: environment From 363a9af8c83ad56eb1d0992058c82c1b5893d888 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 13:19:26 -0700 Subject: [PATCH 042/133] add key vault for grafana --- azure-pipelines-managed-grafana.yml | 2 +- eng/deployment/azure-managed-grafana.bicep | 67 +++++++++++++++++++++- eng/provision-grafana.yaml | 14 +++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index e85e4da17..5ec4a2c21 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -32,7 +32,7 @@ extends: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging ServiceConnectionName: dnceng-managed-grafana-staging - GrafanaWorkspaceName: dnceng-grafana-1 + GrafanaWorkspaceName: dnceng-grafana-staging ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a ${{ else }}: DeploymentEnvironment: Production diff --git a/eng/deployment/azure-managed-grafana.bicep 
b/eng/deployment/azure-managed-grafana.bicep index 6fc3bc6f2..a1cf57707 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -12,12 +12,29 @@ param grafanaWorkspaceName string ]) param skuName string = 'Standard' +@description('The pricing tier for the Grafana key vault') +@allowed([ + 'standard' + 'premium' +]) +param kvSkuName string = 'standard' + +@description('The key vault sku family') +@allowed([ + 'A' + 'premium' +]) +param kvSkuFamily string = 'A' + @description('The deployment environment (Staging or Production)') param environment string +@description('The tenant ID for Azure AD') +param tenantId string = tenant().tenantId + // User-assigned managed identity for Grafana resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { - name: environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-1' + name: environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' location: location tags: { Environment: environment @@ -26,6 +43,49 @@ resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIden } } +// Azure Key Vault for Grafana secrets +resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { + name: environment == 'Production' ? 
'dnceng-grafana-prod-kv' : 'dnceng-grafana-int-kv' + location: location + tags: { + Environment: environment + Purpose: 'Azure Managed Grafana Secrets' + Service: 'DncEng' + } + properties: { + sku: { + family: kvSkuFamily + name: kvSkuName + } + tenantId: tenantId + enabledForDeployment: false + enabledForDiskEncryption: false + enabledForTemplateDeployment: true + enableSoftDelete: true + softDeleteRetentionInDays: 90 + enableRbacAuthorization: true + enablePurgeProtection: true + publicNetworkAccess: 'Enabled' + networkAcls: { + bypass: 'AzureServices' + defaultAction: 'Allow' + } + } +} + +// Grant Key Vault Secrets Officer role to Grafana managed identity +var keyVaultSecretsOfficerRoleId = 'b86a8fe4-44ce-4948-aee5-eccb2c155cd7' + +resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultSecretsOfficerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName @@ -60,3 +120,8 @@ output grafanaTenantId string = grafanaUserAssignedIdentity.properties.tenantId output grafanaWorkspaceLocation string = grafanaWorkspace.location output grafanaUserAssignedIdentityId string = grafanaUserAssignedIdentity.id output grafanaUserAssignedIdentityName string = grafanaUserAssignedIdentity.name + +// Output Key Vault details +output keyVaultId string = grafanaKeyVault.id +output keyVaultName string = grafanaKeyVault.name +output keyVaultUri string = grafanaKeyVault.properties.vaultUri diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 5a5c92321..43d17d399 100644 --- a/eng/provision-grafana.yaml 
+++ b/eng/provision-grafana.yaml @@ -175,4 +175,18 @@ jobs: Write-Host " No role assignments found" } + # Verify Key Vault + $kvName = if ("${{ parameters.DeploymentEnvironment }}" -eq "Production") { "dnceng-grafana-prod-kv" } else { "dnceng-grafana-int-kv" } + Write-Host "" + Write-Host "KEY VAULT DETAILS:" + $keyVault = az keyvault show --name $kvName --resource-group $rgName --query '{name:name, vaultUri:properties.vaultUri, sku:properties.sku.name}' -o json 2>$null | ConvertFrom-Json + if ($keyVault) { + Write-Host " Name: $($keyVault.name)" + Write-Host " Vault URI: $($keyVault.vaultUri)" + Write-Host " SKU: $($keyVault.sku)" + Write-Host " Status: Configured" + } else { + Write-Host " Status: Not found or not accessible" + } + Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" \ No newline at end of file From f3e80366142e6ff00281bf0325d246bf9717f6c6 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 13:23:01 -0700 Subject: [PATCH 043/133] change resource group name --- eng/deploy-managed-grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 131cb0229..86d6b39cd 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -19,6 +19,6 @@ stages: parameters: DeploymentEnvironment: ${{ parameters.DeploymentEnvironment }} ServiceConnectionName: ${{ parameters.ServiceConnectionName }} - GrafanaResourceGroup: 'monitoring-managed' + GrafanaResourceGroup: 'monitoring-managed-new' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' From 6fae23b2439785b107dadd4b5ec9c91d88b8a156 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 13:33:15 -0700 Subject: [PATCH 044/133] change resource group validation script --- eng/provision-grafana.yaml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git 
a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 43d17d399..a9e2cd7fe 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -57,21 +57,24 @@ jobs: scriptType: 'ps' scriptLocation: 'inlineScript' inlineScript: | + $ErrorActionPreference = 'Continue' $rgName = "${{ parameters.GrafanaResourceGroup }}" $location = "${{ parameters.GrafanaLocation }}" Write-Host "Checking if resource group '$rgName' exists..." - $rg = az group show --name $rgName --query "name" --output tsv 2>$null - if ($LASTEXITCODE -ne 0) { - Write-Host "Creating resource group '$rgName' in '$location'..." - az group create --name $rgName --location $location + # Check if resource group exists + $exists = az group exists --name $rgName + + if ($exists -eq 'false') { + Write-Host "Resource group does not exist. Creating resource group '$rgName' in '$location'..." + az group create --name $rgName --location $location --output none if ($LASTEXITCODE -ne 0) { throw "Failed to create resource group '$rgName'" } Write-Host "SUCCESS: Resource group created successfully" } else { - Write-Host "SUCCESS: Resource group already exists" + Write-Host "SUCCESS: Resource group '$rgName' already exists" } - task: AzureResourceManagerTemplateDeployment@3 From 170a6c227b361e5186ae01b471535c02745d7890 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 14:08:27 -0700 Subject: [PATCH 045/133] change service connection --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 42a63b3bc..cb36d019b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -276,10 +276,10 @@ extends: parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: dnceng-managed-grafana-staging + ServiceConnectionName: NetHelixStaging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionClientId: 
4ad9ae35-2d42-4245-a954-9003b7e31349 - ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0 + ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana From 8a6a3a465cb2db346b9ce08e17f656d62d285450 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 14:43:14 -0700 Subject: [PATCH 046/133] change service connection --- azure-pipelines-managed-grafana.yml | 4 ++-- azure-pipelines.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 5ec4a2c21..32fe1e50d 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -31,9 +31,9 @@ extends: parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: dnceng-managed-grafana-staging + ServiceConnectionName: NetHelixStaging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cb36d019b..42a63b3bc 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -276,10 +276,10 @@ extends: parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: NetHelixStaging + ServiceConnectionName: dnceng-managed-grafana-staging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0 - ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 + ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 + ServiceConnectionId: 
f955b932-c7e3-48f7-9d67-4e6542b3568a ${{ else }}: DeploymentEnvironment: Production ServiceConnectionName: dnceng-managed-grafana From a13a12db7b90e22f9e022e39e75e8db1d9e157c0 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 15:22:25 -0700 Subject: [PATCH 047/133] change service connection to use nethelix sc --- azure-pipelines-managed-grafana.yml | 4 ++-- azure-pipelines.yml | 10 ++++------ eng/deploy-managed-grafana.yml | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 32fe1e50d..9de83681d 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -36,7 +36,7 @@ extends: ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production - ServiceConnectionName: dnceng-managed-grafana + ServiceConnectionName: NetHelix GrafanaWorkspaceName: dnceng-grafana - ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a + ServiceConnectionId: a511f6f-b538-48e6-a389-207e430634d1 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 42a63b3bc..f23a8e2dc 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -276,13 +276,11 @@ extends: parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: dnceng-managed-grafana-staging + ServiceConnectionName: NetHelixStaging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionClientId: 4ad9ae35-2d42-4245-a954-9003b7e31349 - ServiceConnectionId: f955b932-c7e3-48f7-9d67-4e6542b3568a + ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production - ServiceConnectionName: dnceng-managed-grafana + ServiceConnectionName: NetHelix GrafanaWorkspaceName: dnceng-grafana - ServiceConnectionClientId: 0ceeca1a-31e7-49ee-9bf4-15f14ed28fa4 - ServiceConnectionId: 332b249e-769b-49a9-9dc9-d82afe28ec0a \ No newline at 
end of file + ServiceConnectionId: a511f6f-b538-48e6-a389-207e430634d1 \ No newline at end of file diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 86d6b39cd..131cb0229 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -19,6 +19,6 @@ stages: parameters: DeploymentEnvironment: ${{ parameters.DeploymentEnvironment }} ServiceConnectionName: ${{ parameters.ServiceConnectionName }} - GrafanaResourceGroup: 'monitoring-managed-new' + GrafanaResourceGroup: 'monitoring-managed' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' From 330b56fb96c2b458653928f40a892ffa7e2ed478 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 15:33:11 -0700 Subject: [PATCH 048/133] grant the managed identity permissions to the keyvault --- eng/deployment/azure-managed-grafana.bicep | 50 +++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index a1cf57707..f11e81430 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -73,8 +73,12 @@ resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { } } -// Grant Key Vault Secrets Officer role to Grafana managed identity +// Define Key Vault role IDs var keyVaultSecretsOfficerRoleId = 'b86a8fe4-44ce-4948-aee5-eccb2c155cd7' +var readerRoleId = 'acdd72a7-3385-48ef-bd42-f606fba81ae7' +var keyVaultCertificateUserRoleId = 'db79e9a7-68ee-4b58-9aeb-b90e7c24fcba' +var keyVaultCryptoUserRoleId = '12338af0-0e69-4776-bea7-57ae8d297424' +var keyVaultSecretsUserRoleId = '4633458b-17de-408a-b874-0445c86b69e6' resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) @@ -86,6 +90,50 @@ resource grafanaKeyVaultSecretsOfficerRole 
'Microsoft.Authorization/roleAssignme } } +// Grant Reader role to Grafana managed identity +resource grafanaKeyVaultReaderRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, readerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', readerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Certificate User role to Grafana managed identity +resource grafanaKeyVaultCertificateUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCertificateUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCertificateUserRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Crypto User role to Grafana managed identity +resource grafanaKeyVaultCryptoUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCryptoUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCryptoUserRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Secrets User role to Grafana managed identity +resource grafanaKeyVaultSecretsUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultSecretsUserRoleId) + principalId: 
grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + // Azure Managed Grafana Workspace resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { name: grafanaWorkspaceName From b5306a462e21ab2256389a4e47225fe8cc64d22b Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 16 Oct 2025 23:06:37 -0700 Subject: [PATCH 049/133] give the .net eng services group grafana admin permissions --- eng/deployment/azure-managed-grafana.bicep | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index f11e81430..e65cb396d 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -32,6 +32,9 @@ param environment string @description('The tenant ID for Azure AD') param tenantId string = tenant().tenantId +@description('The Azure AD Object ID of the .NET Engineering Services group') +param dotnetEngServicesGroupId string = '65d7fc1d-2744-4669-8779-5cd7d7a6b95b' + // User-assigned managed identity for Grafana resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { name: environment == 'Production' ? 
'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' @@ -80,6 +83,9 @@ var keyVaultCertificateUserRoleId = 'db79e9a7-68ee-4b58-9aeb-b90e7c24fcba' var keyVaultCryptoUserRoleId = '12338af0-0e69-4776-bea7-57ae8d297424' var keyVaultSecretsUserRoleId = '4633458b-17de-408a-b874-0445c86b69e6' +// Define Grafana Admin role ID +var grafanaAdminRoleId = '22926164-76b3-42b3-bc55-97df8dab3e41' + resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) scope: grafanaKeyVault @@ -159,6 +165,17 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } +// Grant Grafana Admin role to .NET Engineering Services group +resource dotnetEngServicesGrafanaAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) + scope: grafanaWorkspace + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', grafanaAdminRoleId) + principalId: dotnetEngServicesGroupId + principalType: 'Group' + } +} + // Output the Grafana workspace details output grafanaWorkspaceId string = grafanaWorkspace.id output grafanaWorkspaceName string = grafanaWorkspace.name From d99edad367d270ea7ad83c6c2bd4275d4576de49 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 17 Oct 2025 11:51:35 -0700 Subject: [PATCH 050/133] change the service connection to Dotnet Engineering services --- azure-pipelines-managed-grafana.yml | 7 ++----- azure-pipelines.yml | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 9de83681d..67d8d3d25 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -29,14 +29,11 @@ extends: - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 
'refs/heads/production')}}: - template: /eng/deploy-managed-grafana.yml@self parameters: + ServiceConnectionName: 'Dotnet Engineering services' + ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: NetHelixStaging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production - ServiceConnectionName: NetHelix GrafanaWorkspaceName: dnceng-grafana - ServiceConnectionId: a511f6f-b538-48e6-a389-207e430634d1 - diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f23a8e2dc..21f328f83 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -274,13 +274,11 @@ extends: - template: /eng/deploy-managed-grafana.yml@self parameters: + ServiceConnectionName: 'Dotnet Engineering services' + ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging - ServiceConnectionName: NetHelixStaging GrafanaWorkspaceName: dnceng-grafana-staging - ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: DeploymentEnvironment: Production - ServiceConnectionName: NetHelix - GrafanaWorkspaceName: dnceng-grafana - ServiceConnectionId: a511f6f-b538-48e6-a389-207e430634d1 \ No newline at end of file + GrafanaWorkspaceName: dnceng-grafana \ No newline at end of file From 48cdf59a4f3cbb3ef4424fe10f5f02e7ef889378 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 17 Oct 2025 14:13:53 -0700 Subject: [PATCH 051/133] change grafana keyvault name --- eng/deployment/azure-managed-grafana.bicep | 2 +- eng/provision-grafana.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index e65cb396d..3d79d4f94 100644 --- 
a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -48,7 +48,7 @@ resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIden // Azure Key Vault for Grafana secrets resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { - name: environment == 'Production' ? 'dnceng-grafana-prod-kv' : 'dnceng-grafana-int-kv' + name: environment == 'Production' ? 'dnceng-amg-prod-kv' : 'dnceng-amg-int-kv' location: location tags: { Environment: environment diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index a9e2cd7fe..f86ac268b 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -179,7 +179,7 @@ jobs: } # Verify Key Vault - $kvName = if ("${{ parameters.DeploymentEnvironment }}" -eq "Production") { "dnceng-grafana-prod-kv" } else { "dnceng-grafana-int-kv" } + $kvName = if ("${{ parameters.DeploymentEnvironment }}" -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } Write-Host "" Write-Host "KEY VAULT DETAILS:" $keyVault = az keyvault show --name $kvName --resource-group $rgName --query '{name:name, vaultUri:properties.vaultUri, sku:properties.sku.name}' -o json 2>$null | ConvertFrom-Json From c10a8d8bf637b779450d824200dfcc99f1419965 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 3 Nov 2025 22:33:12 -0800 Subject: [PATCH 052/133] add application gateway --- eng/deployment/azure-appgw-grafana.bicep | 233 +++++++++++++++++++++++ eng/provision-appgw.yaml | 199 +++++++++++++++++++ 2 files changed, 432 insertions(+) create mode 100644 eng/deployment/azure-appgw-grafana.bicep create mode 100644 eng/provision-appgw.yaml diff --git a/eng/deployment/azure-appgw-grafana.bicep b/eng/deployment/azure-appgw-grafana.bicep new file mode 100644 index 000000000..370da85fc --- /dev/null +++ b/eng/deployment/azure-appgw-grafana.bicep @@ -0,0 +1,233 @@ +// Azure Application Gateway with cloudapp.azure.com domains for Azure Managed Grafana 
+@description('The Azure region where the Application Gateway will be deployed') +param location string + +@description('The deployment environment (Staging or Production)') +param environment string + +@description('The Grafana workspace endpoint URL (without https://)') +param grafanaEndpoint string + +@description('The SKU name for Application Gateway') +@allowed([ + 'Standard_v2' + 'WAF_v2' +]) +param skuName string = 'Standard_v2' + +@description('The SKU tier for Application Gateway') +@allowed([ + 'Standard_v2' + 'WAF_v2' +]) +param skuTier string = 'Standard_v2' + +@description('The capacity (instance count) for Application Gateway') +@minValue(1) +@maxValue(10) +param capacity int = 2 + +@description('Tags to apply to resources') +param resourceTags object = { + Environment: environment + Purpose: 'Azure Managed Grafana Custom Domain' + Service: 'DncEng' +} + +// Generate custom domain name based on environment and region +// Format: dnceng-managed-grafana[-staging].{region}.cloudapp.azure.com +var regionShortName = location == 'westus2' ? 'westus2' : location +var publicDnsLabel = environment == 'Production' ? 'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' +var customDomainName = '${publicDnsLabel}.${regionShortName}.cloudapp.azure.com' + +// Resource names +var appGwName = environment == 'Production' ? 'dnceng-grafana-appgw' : 'dnceng-grafana-staging-appgw' +var publicIpName = environment == 'Production' ? 'dnceng-grafana-pip' : 'dnceng-grafana-staging-pip' +var vnetName = environment == 'Production' ? 
'dnceng-grafana-vnet' : 'dnceng-grafana-staging-vnet' +var subnetName = 'appgw-subnet' +var backendPoolName = 'grafana-backend-pool' +var frontendPortName = 'http-port' +var frontendIpConfigName = 'appgw-frontend-ip' +var httpSettingName = 'grafana-http-setting' +var listenerName = 'http-listener' +var ruleName = 'grafana-routing-rule' +var probeName = 'grafana-health-probe' + +// Virtual Network for Application Gateway +resource vnet 'Microsoft.Network/virtualNetworks@2023-05-01' = { + name: vnetName + location: location + tags: resourceTags + properties: { + addressSpace: { + addressPrefixes: [ + '10.0.0.0/16' + ] + } + subnets: [ + { + name: subnetName + properties: { + addressPrefix: '10.0.0.0/24' + privateEndpointNetworkPolicies: 'Disabled' + privateLinkServiceNetworkPolicies: 'Disabled' + } + } + ] + } +} + +// Public IP for Application Gateway with custom DNS label (creates cloudapp.azure.com domain) +resource publicIp 'Microsoft.Network/publicIPAddresses@2023-05-01' = { + name: publicIpName + location: location + tags: resourceTags + sku: { + name: 'Standard' + tier: 'Regional' + } + properties: { + publicIPAllocationMethod: 'Static' + publicIPAddressVersion: 'IPv4' + dnsSettings: { + domainNameLabel: publicDnsLabel + } + idleTimeoutInMinutes: 4 + } +} + +// Application Gateway +resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = { + name: appGwName + location: location + tags: resourceTags + properties: { + sku: { + name: skuName + tier: skuTier + capacity: capacity + } + gatewayIPConfigurations: [ + { + name: 'appgw-ip-config' + properties: { + subnet: { + id: vnet.properties.subnets[0].id + } + } + } + ] + frontendIPConfigurations: [ + { + name: frontendIpConfigName + properties: { + publicIPAddress: { + id: publicIp.id + } + } + } + ] + frontendPorts: [ + { + name: frontendPortName + properties: { + port: 80 + } + } + ] + backendAddressPools: [ + { + name: backendPoolName + properties: { + backendAddresses: [ + { + fqdn: 
grafanaEndpoint + } + ] + } + } + ] + backendHttpSettingsCollection: [ + { + name: httpSettingName + properties: { + port: 443 + protocol: 'Https' + cookieBasedAffinity: 'Enabled' + pickHostNameFromBackendAddress: true + requestTimeout: 30 + probe: { + id: resourceId('Microsoft.Network/applicationGateways/probes', appGwName, probeName) + } + } + } + ] + httpListeners: [ + { + name: listenerName + properties: { + frontendIPConfiguration: { + id: resourceId('Microsoft.Network/applicationGateways/frontendIPConfigurations', appGwName, frontendIpConfigName) + } + frontendPort: { + id: resourceId('Microsoft.Network/applicationGateways/frontendPorts', appGwName, frontendPortName) + } + protocol: 'Http' + requireServerNameIndication: false + } + } + ] + requestRoutingRules: [ + { + name: ruleName + properties: { + ruleType: 'Basic' + priority: 100 + httpListener: { + id: resourceId('Microsoft.Network/applicationGateways/httpListeners', appGwName, listenerName) + } + backendAddressPool: { + id: resourceId('Microsoft.Network/applicationGateways/backendAddressPools', appGwName, backendPoolName) + } + backendHttpSettings: { + id: resourceId('Microsoft.Network/applicationGateways/backendHttpSettingsCollection', appGwName, httpSettingName) + } + } + } + ] + probes: [ + { + name: probeName + properties: { + protocol: 'Https' + path: '/api/health' + interval: 30 + timeout: 30 + unhealthyThreshold: 3 + pickHostNameFromBackendHttpSettings: true + minServers: 0 + match: { + statusCodes: [ + '200-399' + ] + } + } + } + ] + enableHttp2: true + } +} + +// Outputs +output applicationGatewayId string = applicationGateway.id +output applicationGatewayName string = applicationGateway.name +output publicIpAddress string = publicIp.properties.ipAddress +output publicDnsLabel string = publicDnsLabel +output customDomainName string = customDomainName +output customDomainUrl string = 'http://${customDomainName}' +output vnetId string = vnet.id +output vnetName string = vnet.name + +// Usage 
instructions +output usageInstructions string = 'Access Grafana at: http://${customDomainName} (Application Gateway proxies HTTP to HTTPS backend)' +output accessUrl string = 'http://${customDomainName}' diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml new file mode 100644 index 000000000..d757762e8 --- /dev/null +++ b/eng/provision-appgw.yaml @@ -0,0 +1,199 @@ +parameters: + - name: SubscriptionId + type: string + - name: ServiceConnection + type: string + - name: ResourceGroupName + type: string + - name: Environment + type: string + - name: GrafanaWorkspaceName + type: string + - name: Location + type: string + default: 'westus2' + +steps: + - task: AzureCLI@2 + displayName: 'Validate Application Gateway Bicep Template' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Application Gateway Bicep template..." + az bicep build --file eng/deployment/azure-appgw-grafana.bicep + if ($LASTEXITCODE -ne 0) { + Write-Error "Bicep validation failed" + exit 1 + } + Write-Host "āœ“ Application Gateway Bicep template is valid" + + - task: AzureCLI@2 + displayName: 'Get Grafana Endpoint' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Retrieving Grafana workspace endpoint..." 
+ $grafanaEndpointFull = az grafana show ` + --name "${{ parameters.GrafanaWorkspaceName }}" ` + --resource-group "${{ parameters.ResourceGroupName }}" ` + --query "properties.endpoint" ` + --output tsv + + if ([string]::IsNullOrEmpty($grafanaEndpointFull)) { + Write-Error "Failed to retrieve Grafana endpoint" + exit 1 + } + + # Remove https:// prefix and trailing slash for Application Gateway backend + $grafanaEndpoint = $grafanaEndpointFull -replace '^https://', '' -replace '/$', '' + + Write-Host "Grafana Full Endpoint: $grafanaEndpointFull" + Write-Host "Grafana Backend FQDN: $grafanaEndpoint" + Write-Host "##vso[task.setvariable variable=GrafanaEndpoint]$grafanaEndpoint" + Write-Host "##vso[task.setvariable variable=GrafanaEndpointFull]$grafanaEndpointFull" + + - task: AzureResourceManagerTemplateDeployment@3 + displayName: 'Deploy Application Gateway for Grafana' + inputs: + deploymentScope: 'Resource Group' + azureResourceManagerConnection: ${{ parameters.ServiceConnection }} + subscriptionId: ${{ parameters.SubscriptionId }} + action: 'Create Or Update Resource Group' + resourceGroupName: ${{ parameters.ResourceGroupName }} + location: ${{ parameters.Location }} + templateLocation: 'Linked artifact' + csmFile: 'eng/deployment/azure-appgw-grafana.bicep' + overrideParameters: >- + -environment "${{ parameters.Environment }}" + -location "${{ parameters.Location }}" + -grafanaEndpoint "$(GrafanaEndpoint)" + deploymentMode: 'Incremental' + deploymentOutputs: 'AppGatewayOutputs' + + - task: AzureCLI@2 + displayName: 'Display Custom Domain Information' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "================================================" + Write-Host "APPLICATION GATEWAY CUSTOM DOMAIN DEPLOYED" + Write-Host "================================================" + + $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json + + $customDomain = 
$outputs.customDomainName.value + $customDomainUrl = $outputs.customDomainUrl.value + $accessUrl = $outputs.accessUrl.value + $publicIp = $outputs.publicIpAddress.value + $appGwName = $outputs.applicationGatewayName.value + + Write-Host "" + Write-Host "Environment: ${{ parameters.Environment }}" + Write-Host "Application Gateway: $appGwName" + Write-Host "Public IP Address: $publicIp" + Write-Host "Custom Domain: $customDomain" + Write-Host "" + Write-Host "================================================" + Write-Host "IMMEDIATE ACCESS (NO DNS SETUP REQUIRED)" + Write-Host "================================================" + Write-Host "" + Write-Host "šŸŽ‰ Your Grafana is immediately accessible at:" + Write-Host " $accessUrl" + Write-Host "" + Write-Host "The cloudapp.azure.com domain is automatically configured!" + Write-Host "No DNS records needed - the domain works right away." + Write-Host "" + Write-Host "Original Grafana Endpoint: $(GrafanaEndpointFull)" + Write-Host "Backend FQDN: $(GrafanaEndpoint)" + Write-Host "" + Write-Host "================================================" + Write-Host "NEXT STEPS" + Write-Host "================================================" + Write-Host "1. Wait 2-5 minutes for Application Gateway to become healthy" + Write-Host "2. Access Grafana at: $accessUrl" + Write-Host "3. HTTP traffic will be proxied to HTTPS backend" + Write-Host "4. Health probe monitors: $(GrafanaEndpointFull)/api/health" + Write-Host "" + Write-Host "The domain $customDomain is ready to use!" + Write-Host "" + Write-Host "================================================" + + - task: AzureCLI@2 + displayName: 'Verify Application Gateway Deployment' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Verifying Application Gateway deployment..." 
+ + $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json + $appGwName = $outputs.applicationGatewayName.value + $customDomain = $outputs.customDomainName.value + + $maxAttempts = 10 + $attempt = 0 + $verified = $false + + while ($attempt -lt $maxAttempts -and -not $verified) { + $attempt++ + Write-Host "Verification attempt $attempt of $maxAttempts..." + + try { + $provisioningState = az network application-gateway show ` + --name "$appGwName" ` + --resource-group "${{ parameters.ResourceGroupName }}" ` + --query "provisioningState" ` + --output tsv + + Write-Host "Provisioning State: $provisioningState" + + if ($provisioningState -eq "Succeeded") { + Write-Host "āœ“ Application Gateway deployed successfully" + + # Check operational state + $operationalState = az network application-gateway show ` + --name "$appGwName" ` + --resource-group "${{ parameters.ResourceGroupName }}" ` + --query "operationalState" ` + --output tsv + + Write-Host "Operational State: $operationalState" + + if ($operationalState -eq "Running") { + Write-Host "āœ“ Application Gateway is running" + $verified = $true + } else { + Write-Host "Waiting for Application Gateway to start..." + Start-Sleep -Seconds 15 + } + } else { + Write-Host "Application Gateway still provisioning..." + Start-Sleep -Seconds 15 + } + } catch { + Write-Warning "Verification attempt $attempt failed: $_" + Start-Sleep -Seconds 15 + } + } + + if (-not $verified) { + Write-Error "Failed to verify Application Gateway deployment after $maxAttempts attempts" + exit 1 + } + + Write-Host "āœ“ Application Gateway deployment verified successfully" + Write-Host "" + Write-Host "šŸŽ‰ SUCCESS! 
Grafana is now accessible at: http://$customDomain" + Write-Host "" + Write-Host "Note: Backend health probes may take an additional 2-5 minutes to show as healthy" + Write-Host "You can check status with:" + Write-Host "az network application-gateway show-backend-health \" + Write-Host " --name $appGwName \" + Write-Host " --resource-group ${{ parameters.ResourceGroupName }}" \ No newline at end of file From a216a4986311898180c95cbc38ccd53729719a5d Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 3 Nov 2025 22:43:10 -0800 Subject: [PATCH 053/133] add application gateway --- eng/deploy-managed-grafana.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 131cb0229..9b464cba6 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -7,6 +7,9 @@ parameters: type: string - name: GrafanaWorkspaceName type: string +- name: EnableCustomDomain + type: boolean + default: true stages: - stage: ProvisionGrafana @@ -22,3 +25,23 @@ stages: GrafanaResourceGroup: 'monitoring-managed' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' + +- stage: ProvisionApplicationGateway + displayName: 'Provision Application Gateway Custom Domain' + dependsOn: ProvisionGrafana + condition: and(succeeded(), eq(${{ parameters.EnableCustomDomain }}, true)) + jobs: + - job: DeployApplicationGateway + displayName: 'Deploy Azure Application Gateway' + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-ubuntu-2204 + steps: + - template: /eng/provision-appgw.yaml@self + parameters: + SubscriptionId: ${{ parameters.ServiceConnectionId }} + ServiceConnection: ${{ parameters.ServiceConnectionName }} + ResourceGroupName: 'monitoring-managed' + Environment: ${{ parameters.DeploymentEnvironment }} + GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} + Location: 'westus2' From 
9ac01296e622107c2f07be6d1d218fc6c04b4bc9 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 3 Nov 2025 23:20:35 -0800 Subject: [PATCH 054/133] rectify image used for app gateway --- eng/provision-grafana.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index f86ac268b..60a2c1bf5 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -26,7 +26,7 @@ jobs: displayName: 'Provision Azure Managed Grafana' pool: name: NetCore1ESPool-Internal - demands: ImageOverride -equals 1es-windows-2022 + demands: ImageOverride -equals 1es-ubuntu-2204 steps: - checkout: self From 2b36a31ec277075e74dfceee659e9d1b599c4e35 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 4 Nov 2025 08:26:59 -0800 Subject: [PATCH 055/133] rectify image used for app gateway --- eng/deploy-managed-grafana.yml | 2 +- eng/provision-grafana.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 9b464cba6..8cb697235 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -35,7 +35,7 @@ stages: displayName: 'Deploy Azure Application Gateway' pool: name: NetCore1ESPool-Internal - demands: ImageOverride -equals 1es-ubuntu-2204 + demands: ImageOverride -equals 1es-windows-2022 steps: - template: /eng/provision-appgw.yaml@self parameters: diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 60a2c1bf5..f86ac268b 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -26,7 +26,7 @@ jobs: displayName: 'Provision Azure Managed Grafana' pool: name: NetCore1ESPool-Internal - demands: ImageOverride -equals 1es-ubuntu-2204 + demands: ImageOverride -equals 1es-windows-2022 steps: - checkout: self From b988c493ee29421ff27c8f0386589279df35975f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 4 Nov 2025 09:26:48 -0800 Subject: [PATCH 056/133] remove 
unused service connection id and rectify subscription --- azure-pipelines-managed-grafana.yml | 1 - eng/deploy-managed-grafana.yml | 3 --- eng/provision-appgw.yaml | 3 --- 3 files changed, 7 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 67d8d3d25..15e31e9dc 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -30,7 +30,6 @@ extends: - template: /eng/deploy-managed-grafana.yml@self parameters: ServiceConnectionName: 'Dotnet Engineering services' - ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging GrafanaWorkspaceName: dnceng-grafana-staging diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 8cb697235..4743c93cd 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -1,8 +1,6 @@ parameters: - name: ServiceConnectionName type: string -- name: ServiceConnectionId - type: string - name: DeploymentEnvironment type: string - name: GrafanaWorkspaceName @@ -39,7 +37,6 @@ stages: steps: - template: /eng/provision-appgw.yaml@self parameters: - SubscriptionId: ${{ parameters.ServiceConnectionId }} ServiceConnection: ${{ parameters.ServiceConnectionName }} ResourceGroupName: 'monitoring-managed' Environment: ${{ parameters.DeploymentEnvironment }} diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index d757762e8..4d2230c4c 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -1,6 +1,4 @@ parameters: - - name: SubscriptionId - type: string - name: ServiceConnection type: string - name: ResourceGroupName @@ -61,7 +59,6 @@ steps: inputs: deploymentScope: 'Resource Group' azureResourceManagerConnection: ${{ parameters.ServiceConnection }} - subscriptionId: ${{ parameters.SubscriptionId }} action: 'Create Or Update Resource Group' resourceGroupName: ${{ parameters.ResourceGroupName 
}} location: ${{ parameters.Location }} From ac504aad4177ff96d89abd363ccd0b3313ad30d6 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 16:57:59 -0800 Subject: [PATCH 057/133] Enable HTTPS on Application Gateway --- eng/deployment/azure-appgw-grafana.bicep | 43 ++++- eng/generate-appgw-cert.ps1 | 204 +++++++++++++++++++++++ eng/provision-appgw.yaml | 70 +++++++- eng/provision-grafana.yaml | 12 ++ 4 files changed, 317 insertions(+), 12 deletions(-) create mode 100644 eng/generate-appgw-cert.ps1 diff --git a/eng/deployment/azure-appgw-grafana.bicep b/eng/deployment/azure-appgw-grafana.bicep index 370da85fc..7d5786c6d 100644 --- a/eng/deployment/azure-appgw-grafana.bicep +++ b/eng/deployment/azure-appgw-grafana.bicep @@ -34,6 +34,12 @@ param resourceTags object = { Service: 'DncEng' } +@description('Key Vault secret ID for the SSL certificate. This is a URI/URL, not sensitive data.') +param certSecretId string = '' + +@description('Resource ID of the user-assigned managed identity created during Grafana provisioning') +param grafanaUserAssignedIdentityId string + // Generate custom domain name based on environment and region // Format: dnceng-managed-grafana[-staging].{region}.cloudapp.azure.com var regionShortName = location == 'westus2' ? 'westus2' : location @@ -46,12 +52,13 @@ var publicIpName = environment == 'Production' ? 'dnceng-grafana-pip' : 'dnceng- var vnetName = environment == 'Production' ? 
'dnceng-grafana-vnet' : 'dnceng-grafana-staging-vnet' var subnetName = 'appgw-subnet' var backendPoolName = 'grafana-backend-pool' -var frontendPortName = 'http-port' +var frontendPortName = 'https-port' var frontendIpConfigName = 'appgw-frontend-ip' var httpSettingName = 'grafana-http-setting' -var listenerName = 'http-listener' -var ruleName = 'grafana-routing-rule' +var listenerName = 'https-listener' +var ruleName = 'https-routing-rule' var probeName = 'grafana-health-probe' +var sslCertificateName = 'appgw-ssl-cert' // Virtual Network for Application Gateway resource vnet 'Microsoft.Network/virtualNetworks@2023-05-01' = { @@ -77,7 +84,7 @@ resource vnet 'Microsoft.Network/virtualNetworks@2023-05-01' = { } } -// Public IP for Application Gateway with custom DNS label (creates cloudapp.azure.com domain) +// Public IP for Application Gateway with custom DNS label resource publicIp 'Microsoft.Network/publicIPAddresses@2023-05-01' = { name: publicIpName location: location @@ -101,6 +108,12 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = name: appGwName location: location tags: resourceTags + identity: { + type: 'UserAssigned' + userAssignedIdentities: { + '${grafanaUserAssignedIdentityId}': {} + } + } properties: { sku: { name: skuName @@ -131,7 +144,7 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = { name: frontendPortName properties: { - port: 80 + port: 443 } } ] @@ -147,6 +160,14 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = } } ] + sslCertificates: certSecretId != '' ? 
[ + { + name: sslCertificateName + properties: { + keyVaultSecretId: certSecretId + } + } + ] : [] backendHttpSettingsCollection: [ { name: httpSettingName @@ -172,8 +193,11 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = frontendPort: { id: resourceId('Microsoft.Network/applicationGateways/frontendPorts', appGwName, frontendPortName) } - protocol: 'Http' + protocol: 'Https' requireServerNameIndication: false + sslCertificate: certSecretId != '' ? { + id: resourceId('Microsoft.Network/applicationGateways/sslCertificates', appGwName, sslCertificateName) + } : null } } ] @@ -221,13 +245,14 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = // Outputs output applicationGatewayId string = applicationGateway.id output applicationGatewayName string = applicationGateway.name +output applicationGatewayIdentity string = applicationGateway.identity.userAssignedIdentities[grafanaUserAssignedIdentityId].principalId output publicIpAddress string = publicIp.properties.ipAddress output publicDnsLabel string = publicDnsLabel output customDomainName string = customDomainName -output customDomainUrl string = 'http://${customDomainName}' +output customDomainUrl string = 'https://${customDomainName}' output vnetId string = vnet.id output vnetName string = vnet.name // Usage instructions -output usageInstructions string = 'Access Grafana at: http://${customDomainName} (Application Gateway proxies HTTP to HTTPS backend)' -output accessUrl string = 'http://${customDomainName}' +output usageInstructions string = 'Access Grafana at: https://${customDomainName}' +output accessUrl string = 'https://${customDomainName}' diff --git a/eng/generate-appgw-cert.ps1 b/eng/generate-appgw-cert.ps1 new file mode 100644 index 000000000..0f7c68e80 --- /dev/null +++ b/eng/generate-appgw-cert.ps1 @@ -0,0 +1,204 @@ +#!/usr/bin/env pwsh +<# +.SYNOPSIS + Generate or retrieve SSL certificate from Azure Key Vault for Application Gateway 
+.DESCRIPTION + Creates a self-signed certificate in Azure Key Vault for the cloudapp.azure.com custom domain. + If the certificate already exists, it retrieves the secret URI. + Application Gateway references the certificate directly from Key Vault via managed identity. +.PARAMETER DnsName + The DNS name for the certificate (e.g., dnceng-managed-grafana-staging.westus2.cloudapp.azure.com) +.PARAMETER KeyVaultName + The name of the Azure Key Vault to store the certificate +.PARAMETER CertificateName + The name of the certificate in Key Vault (default: appgw-ssl-cert) +.PARAMETER ResourceGroupName + The resource group name for the Key Vault +.EXAMPLE + .\generate-appgw-cert.ps1 -DnsName "dnceng-managed-grafana-staging.westus2.cloudapp.azure.com" -KeyVaultName "dnceng-kv" -ResourceGroupName "monitoring-managed" +#> + +param( + [Parameter(Mandatory = $true)] + [string]$DnsName, + + [Parameter(Mandatory = $true)] + [string]$KeyVaultName, + + [Parameter(Mandatory = $false)] + [string]$CertificateName = "appgw-ssl-cert", + + [Parameter(Mandatory = $true)] + [string]$ResourceGroupName, + + [Parameter(Mandatory = $false)] + [string]$Location = "westus2" +) + +$ErrorActionPreference = "Stop" + +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "Azure Key Vault Certificate Setup" -ForegroundColor Cyan +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "" +Write-Host "DNS Name: $DnsName" -ForegroundColor White +Write-Host "Key Vault: $KeyVaultName" -ForegroundColor White +Write-Host "Certificate: $CertificateName" -ForegroundColor White +Write-Host "Resource Group: $ResourceGroupName" -ForegroundColor White +Write-Host "" + +# Check if Key Vault exists (should already exist from Grafana provisioning) +Write-Host "Verifying Key Vault exists..." 
-ForegroundColor Yellow +$kvExists = az keyvault show --name $KeyVaultName --resource-group $ResourceGroupName 2>$null + +if (!$kvExists) { + Write-Error "Key Vault '$KeyVaultName' not found. It should have been created during Grafana provisioning." + Write-Host "Expected Key Vault names:" -ForegroundColor Yellow + Write-Host " Production: dnceng-amg-prod-kv" -ForegroundColor White + Write-Host " Staging: dnceng-amg-int-kv" -ForegroundColor White + exit 1 +} + +Write-Host "āœ“ Key Vault exists (from Grafana provisioning)" -ForegroundColor Green + +# Check if certificate already exists +Write-Host "" +Write-Host "Checking if certificate exists in Key Vault..." -ForegroundColor Yellow +$certExists = az keyvault certificate show ` + --vault-name $KeyVaultName ` + --name $CertificateName ` + --query "id" ` + --output tsv 2>$null + +if ($certExists) { + Write-Host "āœ“ Certificate '$CertificateName' already exists" -ForegroundColor Green + Write-Host " Using existing certificate" -ForegroundColor White +} else { + Write-Host "Certificate not found. Creating self-signed certificate..." -ForegroundColor Yellow + + # Create certificate policy for self-signed cert + $policy = @" +{ + "issuerParameters": { + "name": "Self" + }, + "x509CertificateProperties": { + "subject": "CN=$DnsName", + "subjectAlternativeNames": { + "dnsNames": ["$DnsName"] + }, + "validityInMonths": 12, + "keyUsage": [ + "digitalSignature", + "keyEncipherment" + ], + "ekus": [ + "1.3.6.1.5.5.7.3.1" + ] + }, + "keyProperties": { + "exportable": true, + "keyType": "RSA", + "keySize": 2048, + "reuseKey": false + }, + "secretProperties": { + "contentType": "application/x-pkcs12" + } +} +"@ + + $policyFile = Join-Path $env:TEMP "cert-policy-$([Guid]::NewGuid()).json" + $policy | Out-File -FilePath $policyFile -Encoding UTF8 + + # Create certificate in Key Vault + Write-Host "Creating certificate in Key Vault (this may take 10-15 seconds)..." 
-ForegroundColor Yellow + + az keyvault certificate create ` + --vault-name $KeyVaultName ` + --name $CertificateName ` + --policy "@$policyFile" ` + --output none + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to create certificate in Key Vault" + Remove-Item $policyFile -Force + exit 1 + } + + Remove-Item $policyFile -Force + + Write-Host "āœ“ Self-signed certificate created successfully" -ForegroundColor Green +} + +# Get certificate secret ID (for Application Gateway) +Write-Host "" +Write-Host "Retrieving certificate secret ID..." -ForegroundColor Yellow + +$secretId = az keyvault certificate show ` + --vault-name $KeyVaultName ` + --name $CertificateName ` + --query "sid" ` + --output tsv + +if ([string]::IsNullOrEmpty($secretId)) { + Write-Error "Failed to retrieve certificate secret ID" + exit 1 +} + +# Get unversioned secret ID (recommended for App Gateway) +$unversionedSecretId = $secretId -replace '/[^/]+$', '' + +Write-Host "āœ“ Certificate secret ID retrieved" -ForegroundColor Green +Write-Host "" +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "Certificate Details" -ForegroundColor Cyan +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "" +Write-Host "Secret ID (versioned):" -ForegroundColor White +Write-Host " $secretId" -ForegroundColor Gray +Write-Host "" +Write-Host "Secret ID (unversioned - recommended):" -ForegroundColor White +Write-Host " $unversionedSecretId" -ForegroundColor Gray +Write-Host "" + +# Get certificate details +$certDetails = az keyvault certificate show ` + --vault-name $KeyVaultName ` + --name $CertificateName ` + --output json | ConvertFrom-Json + +$thumbprint = $certDetails.x509Thumbprint +$expiryDate = $certDetails.attributes.expires +$issuer = $certDetails.policy.issuerParameters.name + +Write-Host "Thumbprint: $thumbprint" -ForegroundColor White +Write-Host "Issuer: $issuer" -ForegroundColor White +Write-Host "Expires: 
$expiryDate" -ForegroundColor White +Write-Host "" + +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "Next Steps" -ForegroundColor Cyan +Write-Host "================================================" -ForegroundColor Cyan +Write-Host "" +Write-Host "1. Grant Application Gateway access to Key Vault" -ForegroundColor Yellow +Write-Host " - Enable managed identity on Application Gateway" -ForegroundColor White +Write-Host " - Grant 'Get' permission on secrets to the identity" -ForegroundColor White +Write-Host "" +Write-Host "2. Use the unversioned secret ID in Bicep template" -ForegroundColor Yellow +Write-Host " - This allows automatic certificate rotation" -ForegroundColor White +Write-Host "" +Write-Host "āš ļø Self-signed certificate notes:" -ForegroundColor Yellow +Write-Host " - Browser will show security warning" -ForegroundColor White +Write-Host " - Valid for 12 months" -ForegroundColor White +Write-Host " - For production, replace with CA-signed certificate" -ForegroundColor White +Write-Host "" + +# Output for pipeline use +Write-Host "Setting pipeline variables..." -ForegroundColor Yellow +Write-Host "##vso[task.setvariable variable=KeyVaultSecretId]$unversionedSecretId" +Write-Host "##vso[task.setvariable variable=CertificateThumbprint]$thumbprint" +Write-Host "##vso[task.setvariable variable=KeyVaultName]$KeyVaultName" + +Write-Host "" +Write-Host "āœ“ Certificate setup complete!" 
-ForegroundColor Green diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index 4d2230c4c..de70fb74b 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -10,6 +10,8 @@ parameters: - name: Location type: string default: 'westus2' + - name: GrafanaIdentityId + type: string steps: - task: AzureCLI@2 @@ -54,6 +56,19 @@ steps: Write-Host "##vso[task.setvariable variable=GrafanaEndpoint]$grafanaEndpoint" Write-Host "##vso[task.setvariable variable=GrafanaEndpointFull]$grafanaEndpointFull" + - task: AzureCLI@2 + displayName: 'Setup Certificate in Key Vault' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'filePath' + scriptPath: 'eng/generate-appgw-cert.ps1' + arguments: >- + -DnsName "dnceng-managed-grafana${{ eq(parameters.Environment, 'Staging') && '-staging' || '' }}.${{ parameters.Location }}.cloudapp.azure.com" + -KeyVaultName "${{ eq(parameters.Environment, 'Production') && 'dnceng-amg-prod-kv' || 'dnceng-amg-int-kv' }}" + -ResourceGroupName "${{ parameters.ResourceGroupName }}" + -Location "${{ parameters.Location }}" + - task: AzureResourceManagerTemplateDeployment@3 displayName: 'Deploy Application Gateway for Grafana' inputs: @@ -68,9 +83,41 @@ steps: -environment "${{ parameters.Environment }}" -location "${{ parameters.Location }}" -grafanaEndpoint "$(GrafanaEndpoint)" + -certSecretId "$(KeyVaultSecretId)" + -grafanaUserAssignedIdentityId "${{ parameters.GrafanaIdentityId }}" deploymentMode: 'Incremental' deploymentOutputs: 'AppGatewayOutputs' + - task: AzureCLI@2 + displayName: 'Grant Application Gateway Access to Key Vault' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting Application Gateway managed identity access to Key Vault..." 
+ + $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json + $appGwIdentity = $outputs.applicationGatewayIdentity.value + $kvName = "$(KeyVaultName)" + + Write-Host "Application Gateway Identity: $appGwIdentity" + Write-Host "Key Vault: $kvName" + + # Grant Get permission on secrets + az keyvault set-policy ` + --name $kvName ` + --object-id $appGwIdentity ` + --secret-permissions get ` + --output none + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to grant Key Vault access" + exit 1 + } + + Write-Host "āœ“ Application Gateway can now access certificates from Key Vault" + - task: AzureCLI@2 displayName: 'Display Custom Domain Information' inputs: @@ -103,6 +150,10 @@ steps: Write-Host "šŸŽ‰ Your Grafana is immediately accessible at:" Write-Host " $accessUrl" Write-Host "" + Write-Host "šŸ”’ HTTPS-only configuration" + Write-Host " Certificate managed in Azure Key Vault" + Write-Host " Self-signed certificate (browser will show security warning)" + Write-Host "" Write-Host "The cloudapp.azure.com domain is automatically configured!" Write-Host "No DNS records needed - the domain works right away." Write-Host "" @@ -114,8 +165,20 @@ steps: Write-Host "================================================" Write-Host "1. Wait 2-5 minutes for Application Gateway to become healthy" Write-Host "2. Access Grafana at: $accessUrl" - Write-Host "3. HTTP traffic will be proxied to HTTPS backend" - Write-Host "4. Health probe monitors: $(GrafanaEndpointFull)/api/health" + Write-Host "3. Accept the self-signed certificate warning in your browser" + Write-Host "4. HTTPS traffic is proxied to Grafana HTTPS backend" + Write-Host "5. 
Health probe monitors: $(GrafanaEndpointFull)/api/health" + Write-Host "" + Write-Host "šŸ”‘ Certificate managed in Key Vault: $(KeyVaultName)" + Write-Host " Thumbprint: $(CertificateThumbprint)" + Write-Host "" + Write-Host "āš ļø HTTP (port 80) is DISABLED - only HTTPS is supported" + Write-Host "" + Write-Host "āš ļø For production, replace the self-signed certificate:" + Write-Host " 1. Import CA-signed certificate to Key Vault:" + Write-Host " az keyvault certificate import --vault-name $(KeyVaultName) \" + Write-Host " --name appgw-ssl-cert --file certificate.pfx" + Write-Host " 2. Application Gateway auto-updates (no redeployment needed)" Write-Host "" Write-Host "The domain $customDomain is ready to use!" Write-Host "" @@ -187,7 +250,8 @@ steps: Write-Host "āœ“ Application Gateway deployment verified successfully" Write-Host "" - Write-Host "šŸŽ‰ SUCCESS! Grafana is now accessible at: http://$customDomain" + Write-Host "šŸŽ‰ SUCCESS! Grafana is now accessible at: https://$customDomain" + Write-Host " (HTTPS only - HTTP disabled)" Write-Host "" Write-Host "Note: Backend health probes may take an additional 2-5 minutes to show as healthy" Write-Host "You can check status with:" diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index f86ac268b..1fe1de998 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -79,6 +79,7 @@ jobs: - task: AzureResourceManagerTemplateDeployment@3 displayName: 'Deploy Grafana Workspace' + name: DeployGrafana inputs: deploymentScope: 'Resource Group' azureResourceManagerConnection: '${{ parameters.ServiceConnectionName }}' @@ -92,6 +93,17 @@ jobs: deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' + - task: PowerShell@2 + displayName: 'Export Grafana Identity for App Gateway' + name: ExportIdentity + inputs: + targetType: 'inline' + script: | + $outputs = '$(grafanaOutputs)' | ConvertFrom-Json + $identityId = 
$outputs.grafanaUserAssignedIdentityId.value + Write-Host "Grafana User-Assigned Identity ID: $identityId" + Write-Host "##vso[task.setvariable variable=GrafanaIdentityId;isOutput=true]$identityId" + - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: From 5445e5806bdc998a9a16e18d7f07f9fea197eb89 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 17:00:17 -0800 Subject: [PATCH 058/133] Enable HTTPS on Application Gateway --- eng/deploy-managed-grafana.yml | 3 +++ eng/generate-appgw-cert.ps1 | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 4743c93cd..69416fb10 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -28,6 +28,8 @@ stages: displayName: 'Provision Application Gateway Custom Domain' dependsOn: ProvisionGrafana condition: and(succeeded(), eq(${{ parameters.EnableCustomDomain }}, true)) + variables: + GrafanaIdentityId: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportIdentity.GrafanaIdentityId'] ] jobs: - job: DeployApplicationGateway displayName: 'Deploy Azure Application Gateway' @@ -42,3 +44,4 @@ stages: Environment: ${{ parameters.DeploymentEnvironment }} GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} Location: 'westus2' + GrafanaIdentityId: $(GrafanaIdentityId) diff --git a/eng/generate-appgw-cert.ps1 b/eng/generate-appgw-cert.ps1 index 0f7c68e80..13a5b414c 100644 --- a/eng/generate-appgw-cert.ps1 +++ b/eng/generate-appgw-cert.ps1 @@ -188,7 +188,7 @@ Write-Host "" Write-Host "2. 
Use the unversioned secret ID in Bicep template" -ForegroundColor Yellow Write-Host " - This allows automatic certificate rotation" -ForegroundColor White Write-Host "" -Write-Host "āš ļø Self-signed certificate notes:" -ForegroundColor Yellow +Write-Host " Self-signed certificate notes:" -ForegroundColor Yellow Write-Host " - Browser will show security warning" -ForegroundColor White Write-Host " - Valid for 12 months" -ForegroundColor White Write-Host " - For production, replace with CA-signed certificate" -ForegroundColor White From b106927ae8487a5f89529ecdb131bde314e12e20 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 17:04:02 -0800 Subject: [PATCH 059/133] remove incorrect operator --- eng/provision-appgw.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index de70fb74b..0d012b660 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -64,8 +64,8 @@ steps: scriptLocation: 'filePath' scriptPath: 'eng/generate-appgw-cert.ps1' arguments: >- - -DnsName "dnceng-managed-grafana${{ eq(parameters.Environment, 'Staging') && '-staging' || '' }}.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "${{ eq(parameters.Environment, 'Production') && 'dnceng-amg-prod-kv' || 'dnceng-amg-int-kv' }}" + -DnsName "dnceng-managed-grafana${{ eq(parameters.Environment, 'Staging') == true && '-staging' || '' }}.${{ parameters.Location }}.cloudapp.azure.com" + -KeyVaultName "${{ eq(parameters.Environment, 'Production') == true && 'dnceng-amg-prod-kv' || 'dnceng-amg-int-kv' }}" -ResourceGroupName "${{ parameters.ResourceGroupName }}" -Location "${{ parameters.Location }}" @@ -169,12 +169,12 @@ steps: Write-Host "4. HTTPS traffic is proxied to Grafana HTTPS backend" Write-Host "5. 
Health probe monitors: $(GrafanaEndpointFull)/api/health" Write-Host "" - Write-Host "šŸ”‘ Certificate managed in Key Vault: $(KeyVaultName)" + Write-Host " Certificate managed in Key Vault: $(KeyVaultName)" Write-Host " Thumbprint: $(CertificateThumbprint)" Write-Host "" - Write-Host "āš ļø HTTP (port 80) is DISABLED - only HTTPS is supported" + Write-Host " HTTP (port 80) is DISABLED - only HTTPS is supported" Write-Host "" - Write-Host "āš ļø For production, replace the self-signed certificate:" + Write-Host " For production, replace the self-signed certificate:" Write-Host " 1. Import CA-signed certificate to Key Vault:" Write-Host " az keyvault certificate import --vault-name $(KeyVaultName) \" Write-Host " --name appgw-ssl-cert --file certificate.pfx" From 17c1ba7449f9a7afc2afbb4f50506dd0549e8b8f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 17:05:23 -0800 Subject: [PATCH 060/133] remove incorrect operator --- eng/provision-appgw.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index 0d012b660..55cb65217 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -64,8 +64,8 @@ steps: scriptLocation: 'filePath' scriptPath: 'eng/generate-appgw-cert.ps1' arguments: >- - -DnsName "dnceng-managed-grafana${{ eq(parameters.Environment, 'Staging') == true && '-staging' || '' }}.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "${{ eq(parameters.Environment, 'Production') == true && 'dnceng-amg-prod-kv' || 'dnceng-amg-int-kv' }}" + -DnsName "dnceng-managed-grafana${{ if(eq(parameters.Environment, 'Staging'), '-staging', '') }}.${{ parameters.Location }}.cloudapp.azure.com" + -KeyVaultName "${{ if(eq(parameters.Environment, 'Production'), 'dnceng-amg-prod-kv', 'dnceng-amg-int-kv') }}" -ResourceGroupName "${{ parameters.ResourceGroupName }}" -Location "${{ parameters.Location }}" From acc362b0038563956de4dbe5572c9a2f51b7f258 Mon Sep 17 00:00:00 
2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 17:09:18 -0800 Subject: [PATCH 061/133] remove incorrect operator --- eng/provision-appgw.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index 55cb65217..b555dff59 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -63,11 +63,18 @@ steps: scriptType: 'pscore' scriptLocation: 'filePath' scriptPath: 'eng/generate-appgw-cert.ps1' - arguments: >- - -DnsName "dnceng-managed-grafana${{ if(eq(parameters.Environment, 'Staging'), '-staging', '') }}.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "${{ if(eq(parameters.Environment, 'Production'), 'dnceng-amg-prod-kv', 'dnceng-amg-int-kv') }}" - -ResourceGroupName "${{ parameters.ResourceGroupName }}" - -Location "${{ parameters.Location }}" + ${{ if eq(parameters.Environment, 'Staging') }}: + arguments: >- + -DnsName "dnceng-managed-grafana-staging.${{ parameters.Location }}.cloudapp.azure.com" + -KeyVaultName "dnceng-amg-int-kv" + -ResourceGroupName "${{ parameters.ResourceGroupName }}" + -Location "${{ parameters.Location }}" + ${{ if eq(parameters.Environment, 'Production') }}: + arguments: >- + -DnsName "dnceng-managed-grafana.${{ parameters.Location }}.cloudapp.azure.com" + -KeyVaultName "dnceng-amg-prod-kv" + -ResourceGroupName "${{ parameters.ResourceGroupName }}" + -Location "${{ parameters.Location }}" - task: AzureResourceManagerTemplateDeployment@3 displayName: 'Deploy Application Gateway for Grafana' From 11618a79497c9e50e5448bec648b570fba118465 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 17:47:00 -0800 Subject: [PATCH 062/133] fix script path --- eng/provision-appgw.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index b555dff59..b69ca26ac 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -61,7 +61,7 @@ steps: inputs: 
azureSubscription: ${{ parameters.ServiceConnection }} scriptType: 'pscore' - scriptLocation: 'filePath' + scriptLocation: 'scriptPath' scriptPath: 'eng/generate-appgw-cert.ps1' ${{ if eq(parameters.Environment, 'Staging') }}: arguments: >- From 916faf5ce34ce7127eba116cfab3549ba99be87d Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 20:30:53 -0800 Subject: [PATCH 063/133] grant the grafana MI Key Vault Certificates Officer role --- eng/deployment/azure-managed-grafana.bicep | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 3d79d4f94..47a0074d8 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -78,6 +78,7 @@ resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { // Define Key Vault role IDs var keyVaultSecretsOfficerRoleId = 'b86a8fe4-44ce-4948-aee5-eccb2c155cd7' +var keyVaultCertificatesOfficerRoleId = 'a4417e6f-fecd-4de8-b567-7b0420556985' var readerRoleId = 'acdd72a7-3385-48ef-bd42-f606fba81ae7' var keyVaultCertificateUserRoleId = 'db79e9a7-68ee-4b58-9aeb-b90e7c24fcba' var keyVaultCryptoUserRoleId = '12338af0-0e69-4776-bea7-57ae8d297424' @@ -118,6 +119,17 @@ resource grafanaKeyVaultCertificateUserRole 'Microsoft.Authorization/roleAssignm } } +// Grant Key Vault Certificates Officer role to Grafana managed identity +resource grafanaKeyVaultCertificatesOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCertificatesOfficerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCertificatesOfficerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + // Grant Key Vault Crypto User role to Grafana managed identity resource 
grafanaKeyVaultCryptoUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCryptoUserRoleId) From c8eec927c74298a2f56d7ab56c0398dffb91864c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 21:45:59 -0800 Subject: [PATCH 064/133] grant pipeline service principal Key Vault Certificates Officer role --- eng/provision-appgw.yaml | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index b69ca26ac..dbbfd3a88 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -56,6 +56,55 @@ steps: Write-Host "##vso[task.setvariable variable=GrafanaEndpoint]$grafanaEndpoint" Write-Host "##vso[task.setvariable variable=GrafanaEndpointFull]$grafanaEndpointFull" + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Key Vault Certificate Access' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Key Vault Certificates Officer role..." 
+ + $kvName = if ("${{ parameters.Environment }}" -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } + $rgName = "${{ parameters.ResourceGroupName }}" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Key Vault resource ID + $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv + Write-Host "Key Vault: $kvName" + Write-Host "Key Vault ID: $kvId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $kvId ` + --role "Key Vault Certificates Officer" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "āœ“ Pipeline service principal already has Key Vault Certificates Officer role" + } else { + Write-Host "Granting Key Vault Certificates Officer role..." + az role assignment create ` + --role "Key Vault Certificates Officer" ` + --assignee $spObjectId ` + --scope $kvId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Pipeline service principal granted Key Vault Certificates Officer role" + Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." 
+ Start-Sleep -Seconds 30 + } else { + Write-Error "Failed to grant Key Vault Certificates Officer role" + exit 1 + } + } + - task: AzureCLI@2 displayName: 'Setup Certificate in Key Vault' inputs: From a71a2ccd3a3ab4a85e296f795f0e00a30f715472 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 5 Nov 2025 22:30:41 -0800 Subject: [PATCH 065/133] Grant Application Gateway Access to Key Vault --- eng/provision-appgw.yaml | 41 +++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index dbbfd3a88..db2e115b4 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -160,19 +160,38 @@ steps: Write-Host "Application Gateway Identity: $appGwIdentity" Write-Host "Key Vault: $kvName" - # Grant Get permission on secrets - az keyvault set-policy ` - --name $kvName ` - --object-id $appGwIdentity ` - --secret-permissions get ` - --output none + # Get the Key Vault resource ID + $rgName = "${{ parameters.ResourceGroupName }}" + $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv - if ($LASTEXITCODE -ne 0) { - Write-Error "Failed to grant Key Vault access" - exit 1 - } + Write-Host "Key Vault ID: $kvId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $appGwIdentity ` + --scope $kvId ` + --role "Key Vault Secrets User" ` + --query "[0].id" ` + --output tsv - Write-Host "āœ“ Application Gateway can now access certificates from Key Vault" + if ($existingAssignment) { + Write-Host "āœ“ Application Gateway already has Key Vault Secrets User role" + } else { + Write-Host "Granting Key Vault Secrets User role (RBAC)..." 
+ az role assignment create ` + --role "Key Vault Secrets User" ` + --assignee $appGwIdentity ` + --scope $kvId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Application Gateway can now access certificates from Key Vault" + } else { + Write-Error "Failed to grant Key Vault access" + exit 1 + } + } + - task: AzureCLI@2 displayName: 'Display Custom Domain Information' From 6d98a297864851fd023809655770ea671ee95993 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 6 Nov 2025 09:14:12 -0800 Subject: [PATCH 066/133] Fix 502 error: Accept 401 status from Grafana health probe --- eng/deployment/azure-appgw-grafana.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/deployment/azure-appgw-grafana.bicep b/eng/deployment/azure-appgw-grafana.bicep index 7d5786c6d..65816e1fe 100644 --- a/eng/deployment/azure-appgw-grafana.bicep +++ b/eng/deployment/azure-appgw-grafana.bicep @@ -232,7 +232,7 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = minServers: 0 match: { statusCodes: [ - '200-399' + '200-401' ] } } From c1f94c3084d4b765416845a3f0ba9abbc185e07d Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 7 Nov 2025 10:30:19 -0800 Subject: [PATCH 067/133] publish grafana dashboard --- eng/deploy-managed-grafana.yml | 89 +++++++++++ eng/provision-grafana.yaml | 16 ++ eng/setup-grafana-api-token.ps1 | 251 ++++++++++++++++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 eng/setup-grafana-api-token.ps1 diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 69416fb10..38e8c79ef 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -45,3 +45,92 @@ stages: GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} Location: 'westus2' GrafanaIdentityId: $(GrafanaIdentityId) + +- stage: PublishDashboards + displayName: 'Publish Grafana Dashboards' + dependsOn: ProvisionGrafana + variables: + GrafanaEndpoint: $[ 
stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] + KeyVaultName: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportGrafanaInfo.KeyVaultName'] ] + jobs: + - job: PublishDashboards + displayName: 'Publish Dashboards to Azure Managed Grafana' + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + steps: + - task: UseDotNet@2 + displayName: 'Install Correct .NET Version' + inputs: + useGlobalJson: true + + - script: dotnet publish --configuration Release $(Build.SourcesDirectory)\src\Monitoring\Sdk\Microsoft.DotNet.Monitoring.Sdk.csproj -f net8.0 + displayName: 'Build Monitoring SDK' + + - task: AzureCLI@2 + displayName: 'Publish Grafana Dashboards' + inputs: + azureSubscription: ${{ parameters.ServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "==========================================" + Write-Host "Publishing Dashboards to Azure Managed Grafana" + Write-Host "==========================================" + Write-Host "Grafana Endpoint: $(GrafanaEndpoint)" + Write-Host "Key Vault: $(KeyVaultName)" + Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" + Write-Host "" + + # Get the API token from Key Vault + $tokenSecretName = "grafana-admin-api-key" + Write-Host "Retrieving API token from Key Vault..." + + $apiToken = az keyvault secret show --vault-name "$(KeyVaultName)" --name $tokenSecretName --query "value" --output tsv + + if (-not $apiToken) { + Write-Error "Failed to retrieve Grafana API token from Key Vault. Please ensure the secret '$tokenSecretName' exists in $(KeyVaultName)" + Write-Host "" + Write-Host "To create the token:" + Write-Host "1. Go to: $(GrafanaEndpoint)" + Write-Host "2. Navigate to: Administration > Service accounts" + Write-Host "3. Create a service account with Admin role" + Write-Host "4. Generate a token" + Write-Host "5. 
Store in Key Vault: az keyvault secret set --vault-name $(KeyVaultName) --name $tokenSecretName --value ''" + exit 1 + } + + Write-Host "āœ“ API token retrieved successfully" + Write-Host "" + Write-Host "Publishing dashboards using MSBuild SDK..." + Write-Host "" + + # Get service connection details for authentication + $servicePrincipalId = $env:servicePrincipalId + $servicePrincipalKey = $env:servicePrincipalKey + + # Publish using the same MSBuild SDK as self-hosted Grafana + dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` + --configuration Release ` + -t:PublishGrafana ` + -p:GrafanaAccessToken=$apiToken ` + -p:GrafanaHost="$(GrafanaEndpoint)" ` + -p:GrafanaKeyVaultName="$(KeyVaultName)" ` + -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` + -p:ParametersFile=parameters.json ` + -v:normal + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to publish dashboards to Grafana" + exit 1 + } + + Write-Host "" + Write-Host "==========================================" + Write-Host "āœ“ SUCCESS! 
Dashboards Published" + Write-Host "==========================================" + Write-Host "" + Write-Host "View your dashboards at:" + Write-Host "$(GrafanaEndpoint)/dashboards" + Write-Host "" + diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 1fe1de998..47db29e9c 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -104,6 +104,22 @@ jobs: Write-Host "Grafana User-Assigned Identity ID: $identityId" Write-Host "##vso[task.setvariable variable=GrafanaIdentityId;isOutput=true]$identityId" + - task: PowerShell@2 + displayName: 'Export Grafana Endpoint and Key Vault for Dashboard Publishing' + name: ExportGrafanaInfo + inputs: + targetType: 'inline' + script: | + $outputs = '$(grafanaOutputs)' | ConvertFrom-Json + $endpoint = $outputs.grafanaWorkspaceUrl.value + $keyVaultName = $outputs.keyVaultName.value + + Write-Host "Grafana Endpoint: $endpoint" + Write-Host "Key Vault Name: $keyVaultName" + + Write-Host "##vso[task.setvariable variable=GrafanaEndpoint;isOutput=true]$endpoint" + Write-Host "##vso[task.setvariable variable=KeyVaultName;isOutput=true]$keyVaultName" + - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 new file mode 100644 index 000000000..263fb7109 --- /dev/null +++ b/eng/setup-grafana-api-token.ps1 @@ -0,0 +1,251 @@ +#!/usr/bin/env pwsh +<# +.SYNOPSIS + Sets up Grafana API token in Key Vault for dashboard publishing +.DESCRIPTION + This script helps you create and store a Grafana API token in Azure Key Vault + for use by the dashboard publishing pipeline. 
+.PARAMETER Environment + The deployment environment (Staging or Production) +.PARAMETER ApiToken + The Grafana API token (if you already have one) +.EXAMPLE + .\setup-grafana-api-token.ps1 -Environment Staging +.EXAMPLE + .\setup-grafana-api-token.ps1 -Environment Production -ApiToken "glsa_xxx" +#> + +param( + [Parameter(Mandatory=$true)] + [ValidateSet("Staging", "Production")] + [string]$Environment, + + [Parameter(Mandatory=$false)] + [string]$ApiToken +) + +Set-StrictMode -Version Latest +$ErrorActionPreference = "Stop" + +# Determine workspace and Key Vault names +$workspaceName = if ($Environment -eq "Production") { "dnceng-grafana" } else { "dnceng-grafana-staging" } +$resourceGroup = "monitoring-managed" +$keyVaultName = if ($Environment -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } +$tokenSecretName = "grafana-admin-api-key" + +Write-Host "==========================================" +Write-Host "Setup Grafana API Token" +Write-Host "==========================================" +Write-Host "Environment: $Environment" +Write-Host "Workspace: $workspaceName" +Write-Host "Key Vault: $keyVaultName" +Write-Host "Secret Name: $tokenSecretName" +Write-Host "" + +# Get Grafana endpoint +Write-Host "Getting Grafana workspace endpoint..." +$grafanaInfo = az grafana show --name $workspaceName --resource-group $resourceGroup --query "{endpoint:properties.endpoint, status:properties.provisioningState}" -o json | ConvertFrom-Json + +if (-not $grafanaInfo -or $grafanaInfo.status -ne "Succeeded") { + Write-Error "Grafana workspace '$workspaceName' is not ready. Status: $($grafanaInfo.status)" + exit 1 +} + +$grafanaEndpoint = $grafanaInfo.endpoint +Write-Host "āœ“ Grafana Endpoint: $grafanaEndpoint" +Write-Host "" + +# Check if token already exists +Write-Host "Checking if API token already exists in Key Vault..." 
+$existingToken = az keyvault secret show --vault-name $keyVaultName --name $tokenSecretName --query "value" -o tsv 2>$null + +if ($existingToken) { + Write-Host "āœ“ Found existing token in Key Vault" + Write-Host "" + Write-Host "Validating token..." + + # Test if the token is still valid by calling Grafana API + $headers = @{ + "Authorization" = "Bearer $existingToken" + "Content-Type" = "application/json" + } + + try { + # Test the token by getting org info (lightweight API call) + $testResponse = Invoke-RestMethod -Uri "$grafanaEndpoint/api/org" -Method Get -Headers $headers -ErrorAction Stop + Write-Host "āœ“ Token is valid and working!" + Write-Host " Organization: $($testResponse.name)" + Write-Host "" + Write-Host "Using existing token. No need to create a new one." + Write-Host "" + Write-Host "==========================================" + Write-Host "āœ“ Setup Complete!" + Write-Host "==========================================" + Write-Host "" + Write-Host "The existing API token in Key Vault is valid." + Write-Host " Key Vault: $keyVaultName" + Write-Host " Secret: $tokenSecretName" + Write-Host "" + Write-Host "The pipeline can publish dashboards to:" + Write-Host " $grafanaEndpoint" + Write-Host "" + exit 0 + } catch { + Write-Host "⚠ Existing token is invalid or expired" + Write-Host " Error: $($_.Exception.Message)" + Write-Host "" + Write-Host "A new token will be created..." + Write-Host "" + } +} + +# Get API token if not provided +if (-not $ApiToken) { + Write-Host "==========================================" + Write-Host "Automated Service Account Creation" + Write-Host "==========================================" + Write-Host "" + Write-Host "This will automatically create a Grafana service account and token." + Write-Host "Using Azure CLI to authenticate to Grafana..." + Write-Host "" + + # Check if AMG extension is installed + Write-Host "Checking Azure CLI Grafana extension..." 
+ $amgExtension = az extension list --query "[?name=='amg'].version" -o tsv + if (-not $amgExtension) { + Write-Host "Installing Azure Managed Grafana CLI extension..." + az extension add --name amg --only-show-errors + Write-Host "āœ“ Extension installed" + } else { + Write-Host "āœ“ Azure Managed Grafana extension already installed (version $amgExtension)" + } + Write-Host "" + + # Create service account using Azure CLI + Write-Host "Creating service account 'grafana-admin'..." + + $serviceAccountJson = az grafana service-account create ` + --name $workspaceName ` + --resource-group $resourceGroup ` + --service-account "grafana-admin" ` + --role "Admin" ` + -o json 2>&1 + + if ($LASTEXITCODE -ne 0) { + # Check if it already exists + if ($serviceAccountJson -like "*already exists*" -or $serviceAccountJson -like "*409*") { + Write-Host "⚠ Service account 'grafana-admin' already exists, retrieving it..." + + $listJson = az grafana service-account list ` + --name $workspaceName ` + --resource-group $resourceGroup ` + -o json + + $serviceAccounts = $listJson | ConvertFrom-Json + $serviceAccount = $serviceAccounts | Where-Object { $_.name -eq "grafana-admin" } | Select-Object -First 1 + + if (-not $serviceAccount) { + Write-Error "Failed to find existing service account 'grafana-admin'" + exit 1 + } + + $serviceAccountId = $serviceAccount.id + Write-Host "āœ“ Found existing service account with ID: $serviceAccountId" + } else { + Write-Error "Failed to create service account:" + Write-Host $serviceAccountJson + exit 1 + } + } else { + $serviceAccount = $serviceAccountJson | ConvertFrom-Json + $serviceAccountId = $serviceAccount.id + Write-Host "āœ“ Service account created with ID: $serviceAccountId" + } + + Write-Host "" + + # Create service account token (expires in 1 day = 86400 seconds) + Write-Host "Creating service account token (expires in 1 day)..." 
+ + $tokenName = "ci-cd-token-$(Get-Date -Format 'yyyyMMdd-HHmmss')" + + $tokenJson = az grafana service-account token create ` + --name $workspaceName ` + --resource-group $resourceGroup ` + --service-account $serviceAccountId ` + --token $tokenName ` + --time-to-live "1d" ` + -o json + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to create service account token:" + Write-Host $tokenJson + exit 1 + } + + $tokenResponse = $tokenJson | ConvertFrom-Json + $ApiToken = $tokenResponse.key + + Write-Host "āœ“ Service account token created" + Write-Host " Token name: $tokenName" + Write-Host " Token ID: $($tokenResponse.id)" + Write-Host " Expires in: 1 day (86400 seconds)" + Write-Host "" +} + +# Validate token format (Grafana service account tokens start with "glsa_") +if (-not $ApiToken.StartsWith("glsa_")) { + Write-Warning "Token doesn't start with 'glsa_' - this might not be a service account token" + $continue = Read-Host "Continue anyway? (y/N)" + if ($continue -ne "y" -and $continue -ne "Y") { + Write-Host "Aborted." + exit 1 + } +} + +# Store in Key Vault +Write-Host "" +Write-Host "Storing API token in Key Vault..." + +try { + az keyvault secret set ` + --vault-name $keyVaultName ` + --name $tokenSecretName ` + --value $ApiToken ` + --output none + + Write-Host "āœ“ Token stored successfully in Key Vault" +} catch { + Write-Error "Failed to store token in Key Vault: $_" + Write-Host "" + Write-Host "Make sure you have the following permissions on the Key Vault:" + Write-Host "- Key Vault Secrets Officer (or Contributor)" + Write-Host "" + Write-Host "You can grant yourself access with:" + Write-Host "az role assignment create --role 'Key Vault Secrets Officer' \" + Write-Host " --assignee \" + Write-Host " --scope /subscriptions//resourceGroups/$resourceGroup/providers/Microsoft.KeyVault/vaults/$keyVaultName" + exit 1 +} + +Write-Host "" +Write-Host "==========================================" +Write-Host "āœ“ Setup Complete!" 
+Write-Host "==========================================" +Write-Host "" +Write-Host "The API token has been stored in:" +Write-Host " Key Vault: $keyVaultName" +Write-Host " Secret: $tokenSecretName" +Write-Host "" +Write-Host "The pipeline can now publish dashboards to:" +Write-Host " $grafanaEndpoint" +Write-Host "" +Write-Host "To test dashboard publishing locally, run:" +Write-Host " dotnet build src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj \" +Write-Host " -t:PublishGrafana \" +Write-Host " -p:GrafanaHost=$grafanaEndpoint \" +Write-Host " -p:GrafanaAccessToken= \" +Write-Host " -p:GrafanaKeyVaultName=$keyVaultName \" +Write-Host " -p:GrafanaEnvironment=$Environment \" +Write-Host " -p:ParametersFile=parameters.json" +Write-Host "" From 59540e61f5cef852d24cbd35481df0cec365db0e Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sat, 8 Nov 2025 20:07:32 -0800 Subject: [PATCH 068/133] add token creation to publish grafana stage --- azure-pipelines-managed-grafana.yml | 2 + eng/deploy-managed-grafana.yml | 44 ++++++++++++++-------- eng/deployment/azure-managed-grafana.bicep | 5 ++- eng/provision-appgw.yaml | 8 ++-- eng/provision-grafana.yaml | 7 +++- eng/setup-grafana-api-token.ps1 | 11 +++++- 6 files changed, 53 insertions(+), 24 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 15e31e9dc..c5fbb60d4 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -33,6 +33,8 @@ extends: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging GrafanaWorkspaceName: dnceng-grafana-staging + GrafanaKeyVault: dnceng-amg-int-kv ${{ else }}: DeploymentEnvironment: Production GrafanaWorkspaceName: dnceng-grafana + GrafanaKeyVault: dnceng-amg-prod-kv diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 38e8c79ef..6ecb2fcf6 100644 --- a/eng/deploy-managed-grafana.yml +++ 
b/eng/deploy-managed-grafana.yml @@ -5,6 +5,8 @@ parameters: type: string - name: GrafanaWorkspaceName type: string +- name: GrafanaKeyVault + type: string - name: EnableCustomDomain type: boolean default: true @@ -23,6 +25,7 @@ stages: GrafanaResourceGroup: 'monitoring-managed' GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} GrafanaLocation: 'westus2' + GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - stage: ProvisionApplicationGateway displayName: 'Provision Application Gateway Custom Domain' @@ -45,16 +48,34 @@ stages: GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} Location: 'westus2' GrafanaIdentityId: $(GrafanaIdentityId) + GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - stage: PublishDashboards displayName: 'Publish Grafana Dashboards' dependsOn: ProvisionGrafana variables: GrafanaEndpoint: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] - KeyVaultName: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportGrafanaInfo.KeyVaultName'] ] jobs: + - job: SetupToken + displayName: 'Setup Grafana API Token' + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + steps: + - task: AzureCLI@2 + displayName: 'Create or Validate Grafana API Token' + inputs: + azureSubscription: ${{ parameters.ServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'scriptPath' + scriptPath: 'eng/setup-grafana-api-token.ps1' + arguments: >- + -Environment "${{ parameters.DeploymentEnvironment }}" + -KeyVaultName "${{ parameters.GrafanaKeyVault }}" + - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' + dependsOn: SetupToken pool: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 @@ -78,7 +99,7 @@ stages: Write-Host "Publishing Dashboards to Azure Managed Grafana" Write-Host "==========================================" Write-Host "Grafana Endpoint: $(GrafanaEndpoint)" - Write-Host 
"Key Vault: $(KeyVaultName)" + Write-Host "Key Vault: ${{ parameters.GrafanaKeyVault }}" Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" @@ -86,17 +107,12 @@ stages: $tokenSecretName = "grafana-admin-api-key" Write-Host "Retrieving API token from Key Vault..." - $apiToken = az keyvault secret show --vault-name "$(KeyVaultName)" --name $tokenSecretName --query "value" --output tsv + $apiToken = az keyvault secret show --vault-name "${{ parameters.GrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv if (-not $apiToken) { - Write-Error "Failed to retrieve Grafana API token from Key Vault. Please ensure the secret '$tokenSecretName' exists in $(KeyVaultName)" - Write-Host "" - Write-Host "To create the token:" - Write-Host "1. Go to: $(GrafanaEndpoint)" - Write-Host "2. Navigate to: Administration > Service accounts" - Write-Host "3. Create a service account with Admin role" - Write-Host "4. Generate a token" - Write-Host "5. Store in Key Vault: az keyvault secret set --vault-name $(KeyVaultName) --name $tokenSecretName --value ''" + Write-Error "Failed to retrieve Grafana API token from Key Vault." + Write-Error "The token should have been created in the previous job (SetupToken)." + Write-Error "Please check the SetupToken job logs for errors." exit 1 } @@ -105,17 +121,13 @@ stages: Write-Host "Publishing dashboards using MSBuild SDK..." 
Write-Host "" - # Get service connection details for authentication - $servicePrincipalId = $env:servicePrincipalId - $servicePrincipalKey = $env:servicePrincipalKey - # Publish using the same MSBuild SDK as self-hosted Grafana dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` --configuration Release ` -t:PublishGrafana ` -p:GrafanaAccessToken=$apiToken ` -p:GrafanaHost="$(GrafanaEndpoint)" ` - -p:GrafanaKeyVaultName="$(KeyVaultName)" ` + -p:GrafanaKeyVaultName="${{ parameters.GrafanaKeyVault }}" ` -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` -p:ParametersFile=parameters.json ` -v:normal diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 47a0074d8..55ca1ae32 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -29,6 +29,9 @@ param kvSkuFamily string = 'A' @description('The deployment environment (Staging or Production)') param environment string +@description('The name of the Key Vault for Grafana secrets') +param keyVaultName string + @description('The tenant ID for Azure AD') param tenantId string = tenant().tenantId @@ -48,7 +51,7 @@ resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIden // Azure Key Vault for Grafana secrets resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { - name: environment == 'Production' ? 
'dnceng-amg-prod-kv' : 'dnceng-amg-int-kv' + name: keyVaultName location: location tags: { Environment: environment diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index db2e115b4..471f77ed2 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -12,6 +12,8 @@ parameters: default: 'westus2' - name: GrafanaIdentityId type: string + - name: GrafanaKeyVault + type: string steps: - task: AzureCLI@2 @@ -65,7 +67,7 @@ steps: inlineScript: | Write-Host "Granting pipeline service principal Key Vault Certificates Officer role..." - $kvName = if ("${{ parameters.Environment }}" -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } + $kvName = "${{ parameters.GrafanaKeyVault }}" $rgName = "${{ parameters.ResourceGroupName }}" # Get the current service principal object ID @@ -115,13 +117,13 @@ steps: ${{ if eq(parameters.Environment, 'Staging') }}: arguments: >- -DnsName "dnceng-managed-grafana-staging.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "dnceng-amg-int-kv" + -KeyVaultName "${{ parameters.GrafanaKeyVault }}" -ResourceGroupName "${{ parameters.ResourceGroupName }}" -Location "${{ parameters.Location }}" ${{ if eq(parameters.Environment, 'Production') }}: arguments: >- -DnsName "dnceng-managed-grafana.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "dnceng-amg-prod-kv" + -KeyVaultName "${{ parameters.GrafanaKeyVault }}" -ResourceGroupName "${{ parameters.ResourceGroupName }}" -Location "${{ parameters.Location }}" diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 47db29e9c..5143b6392 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -21,6 +21,9 @@ parameters: type: string default: 'Standard' +- name: GrafanaKeyVault + type: string + jobs: - job: ProvisionGrafana displayName: 'Provision Azure Managed Grafana' @@ -88,7 +91,7 @@ jobs: location: '${{ parameters.GrafanaLocation }}' templateLocation: 'Linked artifact' csmFile: 
'eng/deployment/azure-managed-grafana.bicep' - overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}" -environment "${{ parameters.DeploymentEnvironment }}"' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}" -environment "${{ parameters.DeploymentEnvironment }}" -keyVaultName "${{ parameters.GrafanaKeyVault }}"' deploymentMode: 'Incremental' deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' deploymentOutputs: 'grafanaOutputs' @@ -207,7 +210,7 @@ jobs: } # Verify Key Vault - $kvName = if ("${{ parameters.DeploymentEnvironment }}" -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } + $kvName = "${{ parameters.GrafanaKeyVault }}" Write-Host "" Write-Host "KEY VAULT DETAILS:" $keyVault = az keyvault show --name $kvName --resource-group $rgName --query '{name:name, vaultUri:properties.vaultUri, sku:properties.sku.name}' -o json 2>$null | ConvertFrom-Json diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 index 263fb7109..cfbd26546 100644 --- a/eng/setup-grafana-api-token.ps1 +++ b/eng/setup-grafana-api-token.ps1 @@ -9,10 +9,14 @@ The deployment environment (Staging or Production) .PARAMETER ApiToken The Grafana API token (if you already have one) +.PARAMETER KeyVaultName + The name of the Key Vault to store the token in (optional, defaults to environment-specific vault) .EXAMPLE .\setup-grafana-api-token.ps1 -Environment Staging .EXAMPLE .\setup-grafana-api-token.ps1 -Environment Production -ApiToken "glsa_xxx" +.EXAMPLE + .\setup-grafana-api-token.ps1 -Environment Staging -KeyVaultName "custom-keyvault" #> param( @@ -21,7 +25,10 @@ param( [string]$Environment, [Parameter(Mandatory=$false)] - [string]$ApiToken + [string]$ApiToken, + + 
[Parameter(Mandatory=$true)] + [string]$KeyVaultName ) Set-StrictMode -Version Latest @@ -30,7 +37,7 @@ $ErrorActionPreference = "Stop" # Determine workspace and Key Vault names $workspaceName = if ($Environment -eq "Production") { "dnceng-grafana" } else { "dnceng-grafana-staging" } $resourceGroup = "monitoring-managed" -$keyVaultName = if ($Environment -eq "Production") { "dnceng-amg-prod-kv" } else { "dnceng-amg-int-kv" } +$keyVaultName = $KeyVaultName $tokenSecretName = "grafana-admin-api-key" Write-Host "==========================================" From 0a79b461e0434c1410d1d91795d884789f07acb7 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sat, 8 Nov 2025 22:04:26 -0800 Subject: [PATCH 069/133] grant service principal grafana admin role --- eng/deploy-managed-grafana.yml | 49 +++++++++++++++++++++++++++++++++ eng/setup-grafana-api-token.ps1 | 32 +++++++++++++++++++-- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 6ecb2fcf6..f49011d6f 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -62,6 +62,55 @@ stages: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 steps: + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Grafana Admin Role' + inputs: + azureSubscription: ${{ parameters.ServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Grafana Admin role..." 
+ + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "monitoring-managed" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Grafana workspace resource ID + $grafanaId = az grafana show --name $workspaceName --resource-group $rgName --query "id" --output tsv + Write-Host "Grafana Workspace: $workspaceName" + Write-Host "Grafana ID: $grafanaId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $grafanaId ` + --role "Grafana Admin" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "āœ“ Pipeline service principal already has Grafana Admin role" + } else { + Write-Host "Granting Grafana Admin role..." + az role assignment create ` + --role "Grafana Admin" ` + --assignee $spObjectId ` + --scope $grafanaId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Pipeline service principal granted Grafana Admin role" + Write-Host "ā± Waiting 15 seconds for role assignment to propagate..." + Start-Sleep -Seconds 15 + } else { + Write-Error "Failed to grant Grafana Admin role" + exit 1 + } + } + - task: AzureCLI@2 displayName: 'Create or Validate Grafana API Token' inputs: diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 index cfbd26546..4c7e876b5 100644 --- a/eng/setup-grafana-api-token.ps1 +++ b/eng/setup-grafana-api-token.ps1 @@ -130,6 +130,9 @@ if (-not $ApiToken) { # Create service account using Azure CLI Write-Host "Creating service account 'grafana-admin'..." 
+ Write-Host "Workspace: $workspaceName" + Write-Host "Resource Group: $resourceGroup" + Write-Host "" $serviceAccountJson = az grafana service-account create ` --name $workspaceName ` @@ -140,27 +143,50 @@ if (-not $ApiToken) { if ($LASTEXITCODE -ne 0) { # Check if it already exists - if ($serviceAccountJson -like "*already exists*" -or $serviceAccountJson -like "*409*") { + if ($serviceAccountJson -like "*already exists*" -or $serviceAccountJson -like "*409*" -or $serviceAccountJson -like "*Conflict*") { Write-Host "⚠ Service account 'grafana-admin' already exists, retrieving it..." $listJson = az grafana service-account list ` --name $workspaceName ` --resource-group $resourceGroup ` - -o json + -o json 2>&1 + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to list service accounts:" + Write-Host $listJson + Write-Host "" + Write-Host "This may be a permissions issue. Ensure the pipeline has access to the Grafana workspace." + exit 1 + } $serviceAccounts = $listJson | ConvertFrom-Json $serviceAccount = $serviceAccounts | Where-Object { $_.name -eq "grafana-admin" } | Select-Object -First 1 if (-not $serviceAccount) { Write-Error "Failed to find existing service account 'grafana-admin'" + Write-Host "Available service accounts:" + $serviceAccounts | ForEach-Object { Write-Host " - $($_.name) (ID: $($_.id))" } exit 1 } $serviceAccountId = $serviceAccount.id Write-Host "āœ“ Found existing service account with ID: $serviceAccountId" } else { - Write-Error "Failed to create service account:" + Write-Error "Failed to create service account. Details:" + Write-Host "" + Write-Host "Error output:" Write-Host $serviceAccountJson + Write-Host "" + Write-Host "Common causes:" + Write-Host " 1. Insufficient permissions - Pipeline needs Grafana Admin role" + Write-Host " 2. Grafana workspace not ready - Wait a few minutes and retry" + Write-Host " 3. 
Network connectivity issues" + Write-Host "" + Write-Host "To grant Grafana Admin role to the pipeline service principal:" + Write-Host " az role assignment create \" + Write-Host " --role 'Grafana Admin' \" + Write-Host " --assignee \" + Write-Host " --scope /subscriptions//resourceGroups/$resourceGroup/providers/Microsoft.Dashboard/grafana/$workspaceName" exit 1 } } else { From 7d95e70c51053ee8ca6127bd396bf654fb543cc1 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 9 Nov 2025 00:52:37 -0800 Subject: [PATCH 070/133] grant service principal key vault officer role --- eng/provision-appgw.yaml | 49 +++++++++++++++++++++++++++++++++ eng/setup-grafana-api-token.ps1 | 16 +++++------ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index 471f77ed2..4a214b906 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -107,6 +107,55 @@ steps: } } + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Key Vault Secrets Access' + inputs: + azureSubscription: ${{ parameters.ServiceConnection }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Key Vault Secrets Officer role..." 
+ + $kvName = "${{ parameters.GrafanaKeyVault }}" + $rgName = "${{ parameters.ResourceGroupName }}" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Key Vault resource ID + $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv + Write-Host "Key Vault: $kvName" + Write-Host "Key Vault ID: $kvId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $kvId ` + --role "Key Vault Secrets Officer" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "āœ“ Pipeline service principal already has Key Vault Secrets Officer role" + } else { + Write-Host "Granting Key Vault Secrets Officer role..." + az role assignment create ` + --role "Key Vault Secrets Officer" ` + --assignee $spObjectId ` + --scope $kvId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Pipeline service principal granted Key Vault Secrets Officer role" + Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." 
+ Start-Sleep -Seconds 30 + } else { + Write-Error "Failed to grant Key Vault Secrets Officer role" + exit 1 + } + } + - task: AzureCLI@2 displayName: 'Setup Certificate in Key Vault' inputs: diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 index 4c7e876b5..89565389f 100644 --- a/eng/setup-grafana-api-token.ps1 +++ b/eng/setup-grafana-api-token.ps1 @@ -10,13 +10,11 @@ .PARAMETER ApiToken The Grafana API token (if you already have one) .PARAMETER KeyVaultName - The name of the Key Vault to store the token in (optional, defaults to environment-specific vault) + The name of the Key Vault to store the token in .EXAMPLE - .\setup-grafana-api-token.ps1 -Environment Staging + .\setup-grafana-api-token.ps1 -Environment Staging -KeyVaultName "dnceng-amg-int-kv" .EXAMPLE - .\setup-grafana-api-token.ps1 -Environment Production -ApiToken "glsa_xxx" -.EXAMPLE - .\setup-grafana-api-token.ps1 -Environment Staging -KeyVaultName "custom-keyvault" + .\setup-grafana-api-token.ps1 -Environment Production -KeyVaultName "dnceng-amg-prod-kv" -ApiToken "glsa_xxx" #> param( @@ -239,6 +237,7 @@ if (-not $ApiToken.StartsWith("glsa_")) { # Store in Key Vault Write-Host "" Write-Host "Storing API token in Key Vault..." +Write-Host " Key Vault: $keyVaultName" try { az keyvault secret set ` @@ -251,10 +250,11 @@ try { } catch { Write-Error "Failed to store token in Key Vault: $_" Write-Host "" - Write-Host "Make sure you have the following permissions on the Key Vault:" - Write-Host "- Key Vault Secrets Officer (or Contributor)" + Write-Host "Make sure the pipeline service principal has the following permissions on the Key Vault:" + Write-Host "- Key Vault Secrets Officer (RBAC role)" Write-Host "" - Write-Host "You can grant yourself access with:" + Write-Host "This should be automatically granted during the ProvisionApplicationGateway stage." 
+ Write-Host "If running manually, you can grant yourself access with:" Write-Host "az role assignment create --role 'Key Vault Secrets Officer' \" Write-Host " --assignee \" Write-Host " --scope /subscriptions//resourceGroups/$resourceGroup/providers/Microsoft.KeyVault/vaults/$keyVaultName" From 0e2b751eec041b18fdde0f856b6f11a05374f8e5 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 9 Nov 2025 11:55:21 -0800 Subject: [PATCH 071/133] Grant pipeline SP Key Vault Secrets Officer role in Grafana provisioning stage --- eng/provision-grafana.yaml | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 5143b6392..10ed2392a 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -123,6 +123,55 @@ jobs: Write-Host "##vso[task.setvariable variable=GrafanaEndpoint;isOutput=true]$endpoint" Write-Host "##vso[task.setvariable variable=KeyVaultName;isOutput=true]$keyVaultName" + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Key Vault Secrets Access' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Key Vault Secrets Officer role..." 
+ + $kvName = "${{ parameters.GrafanaKeyVault }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Key Vault resource ID + $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv + Write-Host "Key Vault: $kvName" + Write-Host "Key Vault ID: $kvId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $kvId ` + --role "Key Vault Secrets Officer" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "āœ“ Pipeline service principal already has Key Vault Secrets Officer role" + } else { + Write-Host "Granting Key Vault Secrets Officer role..." + az role assignment create ` + --role "Key Vault Secrets Officer" ` + --assignee $spObjectId ` + --scope $kvId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Pipeline service principal granted Key Vault Secrets Officer role" + Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." 
+ Start-Sleep -Seconds 30 + } else { + Write-Error "Failed to grant Key Vault Secrets Officer role" + exit 1 + } + } + - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: From 25be23eaf25d9074e6ca5fa4e5f55adcb785c96b Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 9 Nov 2025 17:22:33 -0800 Subject: [PATCH 072/133] Add Key Vault permission verification and retry logic for RBAC propagation --- eng/deploy-managed-grafana.yml | 29 +++++++++++++++++++++++++++++ eng/provision-grafana.yaml | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index f49011d6f..467edfb86 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -152,6 +152,35 @@ stages: Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" + # Verify Key Vault access before attempting to read the token + Write-Host "Verifying Key Vault access..." + $canAccess = $false + $maxRetries = 5 + $retryCount = 0 + + while (-not $canAccess -and $retryCount -lt $maxRetries) { + try { + az keyvault secret list --vault-name "${{ parameters.GrafanaKeyVault }}" --query "[0].name" --output tsv 2>&1 | Out-Null + if ($LASTEXITCODE -eq 0) { + $canAccess = $true + Write-Host "āœ“ Key Vault access verified" + } else { + throw "Access denied" + } + } catch { + $retryCount++ + if ($retryCount -lt $maxRetries) { + Write-Host "ā± Waiting for Key Vault permissions to propagate (attempt $retryCount/$maxRetries)..." + Start-Sleep -Seconds 30 + } else { + Write-Error "Unable to access Key Vault after $maxRetries attempts" + Write-Error "The pipeline service principal may not have the 'Key Vault Secrets Officer' role." + Write-Error "This should be automatically granted in the ProvisionGrafana stage." 
+ exit 1 + } + } + } + # Get the API token from Key Vault $tokenSecretName = "grafana-admin-api-key" Write-Host "Retrieving API token from Key Vault..." diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 10ed2392a..083639f8b 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -164,13 +164,41 @@ jobs: if ($LASTEXITCODE -eq 0) { Write-Host "āœ“ Pipeline service principal granted Key Vault Secrets Officer role" - Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." - Start-Sleep -Seconds 30 + Write-Host "ā± Waiting 60 seconds for role assignment to propagate..." + Start-Sleep -Seconds 60 } else { Write-Error "Failed to grant Key Vault Secrets Officer role" exit 1 } } + + # Verify the permission by attempting to list secrets + Write-Host "" + Write-Host "Verifying Key Vault access..." + $canAccess = $false + $maxRetries = 3 + $retryCount = 0 + + while (-not $canAccess -and $retryCount -lt $maxRetries) { + try { + az keyvault secret list --vault-name $kvName --query "[0].name" --output tsv 2>&1 | Out-Null + if ($LASTEXITCODE -eq 0) { + $canAccess = $true + Write-Host "āœ“ Key Vault access verified successfully" + } else { + throw "Access denied" + } + } catch { + $retryCount++ + if ($retryCount -lt $maxRetries) { + Write-Host "ā± Waiting for permissions to propagate (attempt $retryCount/$maxRetries)..." + Start-Sleep -Seconds 30 + } else { + Write-Warning "Unable to verify Key Vault access after $maxRetries attempts" + Write-Host "Continuing anyway - permissions may still propagate..." 
+ } + } + } - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' From ab8601e6193da7c11887ae00fec26620ed45da28 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 9 Nov 2025 21:09:55 -0800 Subject: [PATCH 073/133] Pass Azure Pipelines credentials to MSBuild SDK for Key Vault authentication --- eng/deploy-managed-grafana.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 467edfb86..14578d40b 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -128,6 +128,10 @@ stages: pool: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 + variables: + # Allow scripts to access the System.AccessToken for Azure authentication + - name: System.AccessToken + value: $(System.AccessToken) steps: - task: UseDotNet@2 displayName: 'Install Correct .NET Version' @@ -143,6 +147,7 @@ stages: azureSubscription: ${{ parameters.ServiceConnectionName }} scriptType: 'pscore' scriptLocation: 'inlineScript' + addSpnToEnvironment: true inlineScript: | Write-Host "==========================================" Write-Host "Publishing Dashboards to Azure Managed Grafana" @@ -152,6 +157,11 @@ stages: Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" + # Get service principal client ID from environment variable set by Azure CLI task + $clientId = $env:servicePrincipalId + Write-Host "Service Principal Client ID: $clientId" + Write-Host "" + # Verify Key Vault access before attempting to read the token Write-Host "Verifying Key Vault access..." $canAccess = $false @@ -199,6 +209,14 @@ stages: Write-Host "Publishing dashboards using MSBuild SDK..." 
Write-Host "" + # Get Azure DevOps service connection details for Key Vault authentication + $serviceConnectionId = "${{ parameters.ServiceConnectionName }}" + $systemAccessToken = "$(System.AccessToken)" + + # The ClientId is the service principal application (client) ID + # This comes from the service connection + Write-Host "Service Connection: $serviceConnectionId" + # Publish using the same MSBuild SDK as self-hosted Grafana dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` --configuration Release ` @@ -208,6 +226,9 @@ stages: -p:GrafanaKeyVaultName="${{ parameters.GrafanaKeyVault }}" ` -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` -p:ParametersFile=parameters.json ` + -p:ClientId="$clientId" ` + -p:ServiceConnectionId="${{ parameters.ServiceConnectionName }}" ` + -p:SystemAccessToken="$(System.AccessToken)" ` -v:normal if ($LASTEXITCODE -ne 0) { From d40d857c1526bb065c8fb50b67e4bab8df457b88 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 10 Nov 2025 20:50:27 -0800 Subject: [PATCH 074/133] add service connection ID --- azure-pipelines-managed-grafana.yml | 1 + eng/deploy-managed-grafana.yml | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index c5fbb60d4..b2b555cc8 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -30,6 +30,7 @@ extends: - template: /eng/deploy-managed-grafana.yml@self parameters: ServiceConnectionName: 'Dotnet Engineering services' + ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging GrafanaWorkspaceName: dnceng-grafana-staging diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 14578d40b..fdf6680c8 100644 --- a/eng/deploy-managed-grafana.yml +++ 
b/eng/deploy-managed-grafana.yml @@ -1,6 +1,8 @@ parameters: - name: ServiceConnectionName type: string +- name: ServiceConnectionId + type: string - name: DeploymentEnvironment type: string - name: GrafanaWorkspaceName @@ -210,12 +212,13 @@ stages: Write-Host "" # Get Azure DevOps service connection details for Key Vault authentication - $serviceConnectionId = "${{ parameters.ServiceConnectionName }}" + $serviceConnectionId = "${{ parameters.ServiceConnectionId }}" $systemAccessToken = "$(System.AccessToken)" # The ClientId is the service principal application (client) ID # This comes from the service connection - Write-Host "Service Connection: $serviceConnectionId" + Write-Host "Service Connection: ${{ parameters.ServiceConnectionName }}" + Write-Host "Service Connection ID: $serviceConnectionId" # Publish using the same MSBuild SDK as self-hosted Grafana dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` @@ -227,8 +230,8 @@ stages: -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` -p:ParametersFile=parameters.json ` -p:ClientId="$clientId" ` - -p:ServiceConnectionId="${{ parameters.ServiceConnectionName }}" ` - -p:SystemAccessToken="$(System.AccessToken)" ` + -p:ServiceConnectionId="$serviceConnectionId" ` + -p:SystemAccessToken="$systemAccessToken" ` -v:normal if ($LASTEXITCODE -ne 0) { From ca72eb5579fa238bfe3d2b29098d2880f3a0e46a Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 10 Nov 2025 22:16:38 -0800 Subject: [PATCH 075/133] add service connection client ID --- azure-pipelines-managed-grafana.yml | 1 + eng/deploy-managed-grafana.yml | 17 +++++------------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index b2b555cc8..6b93400cb 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -31,6 +31,7 @@ extends: parameters: ServiceConnectionName: 
'Dotnet Engineering services' ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging GrafanaWorkspaceName: dnceng-grafana-staging diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index fdf6680c8..a098fb874 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -1,6 +1,8 @@ parameters: - name: ServiceConnectionName type: string +- name: ServiceConnectionClientId + type: string - name: ServiceConnectionId type: string - name: DeploymentEnvironment @@ -159,11 +161,6 @@ stages: Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" - # Get service principal client ID from environment variable set by Azure CLI task - $clientId = $env:servicePrincipalId - Write-Host "Service Principal Client ID: $clientId" - Write-Host "" - # Verify Key Vault access before attempting to read the token Write-Host "Verifying Key Vault access..." $canAccess = $false @@ -211,10 +208,6 @@ stages: Write-Host "Publishing dashboards using MSBuild SDK..." 
Write-Host "" - # Get Azure DevOps service connection details for Key Vault authentication - $serviceConnectionId = "${{ parameters.ServiceConnectionId }}" - $systemAccessToken = "$(System.AccessToken)" - # The ClientId is the service principal application (client) ID # This comes from the service connection Write-Host "Service Connection: ${{ parameters.ServiceConnectionName }}" @@ -229,9 +222,9 @@ stages: -p:GrafanaKeyVaultName="${{ parameters.GrafanaKeyVault }}" ` -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` -p:ParametersFile=parameters.json ` - -p:ClientId="$clientId" ` - -p:ServiceConnectionId="$serviceConnectionId" ` - -p:SystemAccessToken="$systemAccessToken" ` + -p:ClientId="${{ parameters.ServiceConnectionClientId }}" ` + -p:ServiceConnectionId="${{ parameters.ServiceConnectionId }}" ` + -p:SystemAccessToken="$(System.AccessToken)" ` -v:normal if ($LASTEXITCODE -ne 0) { From 4b97270e953c296b8e9bc588a693fe2f4acf399f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Mon, 10 Nov 2025 22:28:37 -0800 Subject: [PATCH 076/133] remove undefined ServiceConnectionId --- eng/deploy-managed-grafana.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index a098fb874..3d0f90294 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -208,11 +208,6 @@ stages: Write-Host "Publishing dashboards using MSBuild SDK..." 
Write-Host "" - # The ClientId is the service principal application (client) ID - # This comes from the service connection - Write-Host "Service Connection: ${{ parameters.ServiceConnectionName }}" - Write-Host "Service Connection ID: $serviceConnectionId" - # Publish using the same MSBuild SDK as self-hosted Grafana dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` --configuration Release ` From 7f3fda82c8ab1135c7770f90c449897fa3ce4de2 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 08:50:33 -0800 Subject: [PATCH 077/133] import secrets from dotnet-grafana-secrets.yaml --- .vault-config/dngeng-amg-int-kv.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .vault-config/dngeng-amg-int-kv.yaml diff --git a/.vault-config/dngeng-amg-int-kv.yaml b/.vault-config/dngeng-amg-int-kv.yaml new file mode 100644 index 000000000..794e24c44 --- /dev/null +++ b/.vault-config/dngeng-amg-int-kv.yaml @@ -0,0 +1,7 @@ +storageLocation: + type: azure-key-vault + parameters: + subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 + name: dnceng-amg-int-kv + +importSecretsFrom: shared/dotnet-grafana-secrets.yaml From 9f6d58d810623f4d9240600d539c80a357377fb8 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 10:05:23 -0800 Subject: [PATCH 078/133] remove dnceng-amg-int-kv.yaml file --- .vault-config/dngeng-amg-int-kv.yaml | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 .vault-config/dngeng-amg-int-kv.yaml diff --git a/.vault-config/dngeng-amg-int-kv.yaml b/.vault-config/dngeng-amg-int-kv.yaml deleted file mode 100644 index 794e24c44..000000000 --- a/.vault-config/dngeng-amg-int-kv.yaml +++ /dev/null @@ -1,7 +0,0 @@ -storageLocation: - type: azure-key-vault - parameters: - subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 - name: dnceng-amg-int-kv - -importSecretsFrom: shared/dotnet-grafana-secrets.yaml From 81f4ed58d7a3c180780e53575c218eeac4abd279 Mon Sep 17 
00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 12:03:59 -0800 Subject: [PATCH 079/133] remove notification alerts and synchronize secrets --- .vault-config/dnceng-amg-int-kv.yaml | 26 +++++++++++++++++++++++++ src/Monitoring/Sdk/DeployPublisher.cs | 5 ++++- src/Monitoring/Sdk/MonitoringPublish.cs | 7 ++++++- 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 .vault-config/dnceng-amg-int-kv.yaml diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml new file mode 100644 index 000000000..68a0c2697 --- /dev/null +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -0,0 +1,26 @@ +storageLocation: + type: azure-key-vault + parameters: + subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 + name: dnceng-amg-int-kv + +secrets: + # Copy only the secrets needed for Azure Managed Grafana datasources and notifications + + # API token for DotNet Status website + dotnet-build-bot-dotnet-eng-status-token: + type: text + parameters: + description: API token from https://dotneteng-status{-staging}.azurewebsites.net/ - Generated using dotnet-build-bot account + + # Authorization header for Deployment Annotations datasource + dotneteng-status-auth-header: + type: text + parameters: + description: "Bearer token for status API - Format: Bearer " + + # Teams webhook URL for alert notifications + fr-bot-notifications-teams-notification-url: + type: text + parameters: + description: Teams Incoming Webhook URL - Do not rotate diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index 854da03f2..5a7e51583 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -62,7 +62,10 @@ public async Task PostToGrafanaAsync() { await PostDatasourcesAsync().ConfigureAwait(false); - await PostNotificationsAsync().ConfigureAwait(false); + // Skip notifications for Azure Managed Grafana - it uses Grafana Unified Alerting + // which has different APIs 
(/api/v1/provisioning/contact-points) + // TODO: Implement unified alerting support + // await PostNotificationsAsync().ConfigureAwait(false); await PostDashboardsAsync().ConfigureAwait(false); } diff --git a/src/Monitoring/Sdk/MonitoringPublish.cs b/src/Monitoring/Sdk/MonitoringPublish.cs index 8fee63594..46e7d762e 100644 --- a/src/Monitoring/Sdk/MonitoringPublish.cs +++ b/src/Monitoring/Sdk/MonitoringPublish.cs @@ -114,7 +114,12 @@ private async Task ExecuteAsync() } catch (HttpRequestException e) { - Log.LogErrorFromException(e, showStackTrace: false, showDetail: false, file: "MonitoringPublish"); + Log.LogErrorFromException(e, showStackTrace: true, showDetail: true, file: "MonitoringPublish"); + return false; + } + catch (System.Exception e) + { + Log.LogErrorFromException(e, showStackTrace: true, showDetail: true, file: "MonitoringPublish"); return false; } } From 381b93660d5ee9757eb2a79d0d73f049346c6264 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 14:22:04 -0800 Subject: [PATCH 080/133] fix key vault access propagation --- eng/deploy-managed-grafana.yml | 8 +++++--- eng/provision-grafana.yaml | 30 ++---------------------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 3d0f90294..87e409a5f 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -163,9 +163,11 @@ stages: # Verify Key Vault access before attempting to read the token Write-Host "Verifying Key Vault access..." + Write-Host "ā„¹ļø Note: Azure RBAC can take 5-10 minutes to propagate to Key Vault data plane" $canAccess = $false $maxRetries = 5 $retryCount = 0 + $waitSeconds = 45 while (-not $canAccess -and $retryCount -lt $maxRetries) { try { @@ -179,10 +181,10 @@ stages: } catch { $retryCount++ if ($retryCount -lt $maxRetries) { - Write-Host "ā± Waiting for Key Vault permissions to propagate (attempt $retryCount/$maxRetries)..." 
- Start-Sleep -Seconds 30 + Write-Host "ā± Waiting for Key Vault permissions to propagate (attempt $retryCount/$maxRetries, waiting $waitSeconds seconds)..." + Start-Sleep -Seconds $waitSeconds } else { - Write-Error "Unable to access Key Vault after $maxRetries attempts" + Write-Error "Unable to access Key Vault after $maxRetries attempts ($($maxRetries * $waitSeconds) seconds total)" Write-Error "The pipeline service principal may not have the 'Key Vault Secrets Officer' role." Write-Error "This should be automatically granted in the ProvisionGrafana stage." exit 1 diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 083639f8b..1d6f63197 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -164,41 +164,15 @@ jobs: if ($LASTEXITCODE -eq 0) { Write-Host "āœ“ Pipeline service principal granted Key Vault Secrets Officer role" - Write-Host "ā± Waiting 60 seconds for role assignment to propagate..." - Start-Sleep -Seconds 60 } else { Write-Error "Failed to grant Key Vault Secrets Officer role" exit 1 } } - # Verify the permission by attempting to list secrets Write-Host "" - Write-Host "Verifying Key Vault access..." - $canAccess = $false - $maxRetries = 3 - $retryCount = 0 - - while (-not $canAccess -and $retryCount -lt $maxRetries) { - try { - az keyvault secret list --vault-name $kvName --query "[0].name" --output tsv 2>&1 | Out-Null - if ($LASTEXITCODE -eq 0) { - $canAccess = $true - Write-Host "āœ“ Key Vault access verified successfully" - } else { - throw "Access denied" - } - } catch { - $retryCount++ - if ($retryCount -lt $maxRetries) { - Write-Host "ā± Waiting for permissions to propagate (attempt $retryCount/$maxRetries)..." - Start-Sleep -Seconds 30 - } else { - Write-Warning "Unable to verify Key Vault access after $maxRetries attempts" - Write-Host "Continuing anyway - permissions may still propagate..." 
- } - } - } + Write-Host "ā„¹ļø Note: Azure RBAC permissions can take 5-10 minutes to propagate to Key Vault data plane" + Write-Host "ā„¹ļø The PublishDashboards stage has retry logic to handle propagation delays" - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' From c617b3a0117e8ccbcd6932e0bc507d716968fb8b Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 15:06:52 -0800 Subject: [PATCH 081/133] fix keyvault secret access --- eng/deploy-managed-grafana.yml | 57 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 87e409a5f..44ef6e827 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -161,51 +161,50 @@ stages: Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" - # Verify Key Vault access before attempting to read the token - Write-Host "Verifying Key Vault access..." + # Get the API token from Key Vault with retry logic for RBAC propagation + $tokenSecretName = "grafana-admin-api-key" + Write-Host "Retrieving API token from Key Vault..." 
Write-Host "ā„¹ļø Note: Azure RBAC can take 5-10 minutes to propagate to Key Vault data plane" - $canAccess = $false - $maxRetries = 5 + + $apiToken = $null + $maxRetries = 12 $retryCount = 0 - $waitSeconds = 45 + $waitSeconds = 30 - while (-not $canAccess -and $retryCount -lt $maxRetries) { + while (-not $apiToken -and $retryCount -lt $maxRetries) { try { - az keyvault secret list --vault-name "${{ parameters.GrafanaKeyVault }}" --query "[0].name" --output tsv 2>&1 | Out-Null - if ($LASTEXITCODE -eq 0) { - $canAccess = $true - Write-Host "āœ“ Key Vault access verified" + $apiToken = az keyvault secret show --vault-name "${{ parameters.GrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv 2>&1 + + if ($LASTEXITCODE -eq 0 -and $apiToken -and $apiToken.Trim()) { + Write-Host "āœ“ API token retrieved successfully from Key Vault" + break } else { - throw "Access denied" + $apiToken = $null + throw "Failed to retrieve token" } } catch { $retryCount++ if ($retryCount -lt $maxRetries) { - Write-Host "ā± Waiting for Key Vault permissions to propagate (attempt $retryCount/$maxRetries, waiting $waitSeconds seconds)..." + Write-Host "ā± Waiting for Key Vault access (attempt $retryCount/$maxRetries, waiting $waitSeconds seconds)..." Start-Sleep -Seconds $waitSeconds } else { - Write-Error "Unable to access Key Vault after $maxRetries attempts ($($maxRetries * $waitSeconds) seconds total)" - Write-Error "The pipeline service principal may not have the 'Key Vault Secrets Officer' role." - Write-Error "This should be automatically granted in the ProvisionGrafana stage." + Write-Error "Unable to retrieve API token after $maxRetries attempts ($($maxRetries * $waitSeconds) seconds total)" + Write-Error "Secret name: $tokenSecretName" + Write-Error "Key Vault: ${{ parameters.GrafanaKeyVault }}" + Write-Error "" + Write-Error "Possible causes:" + Write-Error "1. RBAC permissions haven't propagated yet (can take 5-10 minutes)" + Write-Error "2. 
The SetupToken job failed to create the token" + Write-Error "3. The pipeline service principal doesn't have Key Vault Secrets Officer role" + Write-Error "" + Write-Error "Please check:" + Write-Error "- SetupToken job logs for errors" + Write-Error "- ProvisionGrafana stage logs for RBAC assignment status" exit 1 } } } - # Get the API token from Key Vault - $tokenSecretName = "grafana-admin-api-key" - Write-Host "Retrieving API token from Key Vault..." - - $apiToken = az keyvault secret show --vault-name "${{ parameters.GrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv - - if (-not $apiToken) { - Write-Error "Failed to retrieve Grafana API token from Key Vault." - Write-Error "The token should have been created in the previous job (SetupToken)." - Write-Error "Please check the SetupToken job logs for errors." - exit 1 - } - - Write-Host "āœ“ API token retrieved successfully" Write-Host "" Write-Host "Publishing dashboards using MSBuild SDK..." Write-Host "" From 6f68e64b2de53bb251d153fa4f8919d28649fac7 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 18:43:50 -0800 Subject: [PATCH 082/133] grant MI monitoring reader permission to subscriptions --- eng/deployment/azure-managed-grafana.bicep | 62 +++++++++++++++++++ .../grafana-monitoring-reader.bicep | 29 +++++++++ 2 files changed, 91 insertions(+) create mode 100644 eng/deployment/grafana-monitoring-reader.bicep diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 55ca1ae32..424dbd91f 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -90,6 +90,56 @@ var keyVaultSecretsUserRoleId = '4633458b-17de-408a-b874-0445c86b69e6' // Define Grafana Admin role ID var grafanaAdminRoleId = '22926164-76b3-42b3-bc55-97df8dab3e41' +// Subscription IDs for Azure Monitor access +var stagingSubscriptions = [ + { + name: 'DotNetProductConstructionServicesStaging' + id: 
'e6b5f9f5-0ca4-4351-879b-014d78400ec2' + } + { + name: 'HelixStaging' + id: 'cab65fc3-d077-467d-931f-3932eabf36d3' + } + { + name: 'DncEngInternalTooling' + id: '84a65c9a-787d-45da-b10a-3a1cefce8060' + } + { + name: 'DotnetEngineeringServices' + id: 'a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1' + } + { + name: 'Helix' + id: '68672ab8-de0c-40f1-8d1b-ffb20bd62c0f' + } +] + +var productionSubscriptions = [ + { + name: 'DotNetProductConstructionServices' + id: 'fbd6122a-9ad3-42e4-976e-bccb82486856' + } + { + name: 'HelixStaging' + id: 'cab65fc3-d077-467d-931f-3932eabf36d3' + } + { + name: 'DncEngInternalTooling' + id: '84a65c9a-787d-45da-b10a-3a1cefce8060' + } + { + name: 'DotnetEngineeringServices' + id: 'a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1' + } + { + name: 'Helix' + id: '68672ab8-de0c-40f1-8d1b-ffb20bd62c0f' + } +] + +// Select subscription list based on environment +var monitoringSubscriptions = environment == 'Production' ? productionSubscriptions : stagingSubscriptions + resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) scope: grafanaKeyVault @@ -180,6 +230,18 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } +// Grant Monitoring Reader role to Grafana managed identity on multiple subscriptions +// This allows Azure Monitor datasources to query metrics and logs +module grafanaMonitoringReaderRoles 'grafana-monitoring-reader.bicep' = [for sub in monitoringSubscriptions: { + name: 'monitoringReader-${sub.name}-${environment}' + scope: subscription(sub.id) + params: { + grafanaPrincipalId: grafanaUserAssignedIdentity.properties.principalId + environment: environment + identityResourceId: grafanaUserAssignedIdentity.id + } +}] + // Grant Grafana Admin role to .NET Engineering Services group resource dotnetEngServicesGrafanaAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: 
guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) diff --git a/eng/deployment/grafana-monitoring-reader.bicep b/eng/deployment/grafana-monitoring-reader.bicep new file mode 100644 index 000000000..e6a81befc --- /dev/null +++ b/eng/deployment/grafana-monitoring-reader.bicep @@ -0,0 +1,29 @@ +// Module to grant Monitoring Reader role at subscription scope +// This must be deployed at subscription scope, so it's a separate module + +targetScope = 'subscription' + +@description('The principal ID of the Grafana managed identity') +param grafanaPrincipalId string + +@description('The deployment environment (for unique naming)') +param environment string + +@description('The managed identity resource ID (for unique naming)') +param identityResourceId string + +// Monitoring Reader role ID +var monitoringReaderRoleId = '43d0d8ad-25c7-4714-9337-8ba259a9fe05' + +// Grant Monitoring Reader role to Grafana managed identity +resource monitoringReaderRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(subscription().id, identityResourceId, monitoringReaderRoleId, environment) + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', monitoringReaderRoleId) + principalId: grafanaPrincipalId + principalType: 'ServicePrincipal' + description: 'Grants Grafana managed identity read access to Azure Monitor resources for ${environment} environment' + } +} + +output roleAssignmentId string = monitoringReaderRole.id From 5ce1be5d1a66e7d85205ea5f0d0bfb0b096e0c43 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 11 Nov 2025 21:32:10 -0800 Subject: [PATCH 083/133] grant MI monitoring reader permission to subscriptions in the pipeline --- eng/deployment/azure-managed-grafana.bicep | 62 ---------- .../grafana-monitoring-reader.bicep | 29 ----- eng/provision-grafana.yaml | 109 ++++++++++++++++++ 3 files changed, 109 insertions(+), 91 deletions(-) delete mode 100644 
eng/deployment/grafana-monitoring-reader.bicep diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 424dbd91f..55ca1ae32 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -90,56 +90,6 @@ var keyVaultSecretsUserRoleId = '4633458b-17de-408a-b874-0445c86b69e6' // Define Grafana Admin role ID var grafanaAdminRoleId = '22926164-76b3-42b3-bc55-97df8dab3e41' -// Subscription IDs for Azure Monitor access -var stagingSubscriptions = [ - { - name: 'DotNetProductConstructionServicesStaging' - id: 'e6b5f9f5-0ca4-4351-879b-014d78400ec2' - } - { - name: 'HelixStaging' - id: 'cab65fc3-d077-467d-931f-3932eabf36d3' - } - { - name: 'DncEngInternalTooling' - id: '84a65c9a-787d-45da-b10a-3a1cefce8060' - } - { - name: 'DotnetEngineeringServices' - id: 'a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1' - } - { - name: 'Helix' - id: '68672ab8-de0c-40f1-8d1b-ffb20bd62c0f' - } -] - -var productionSubscriptions = [ - { - name: 'DotNetProductConstructionServices' - id: 'fbd6122a-9ad3-42e4-976e-bccb82486856' - } - { - name: 'HelixStaging' - id: 'cab65fc3-d077-467d-931f-3932eabf36d3' - } - { - name: 'DncEngInternalTooling' - id: '84a65c9a-787d-45da-b10a-3a1cefce8060' - } - { - name: 'DotnetEngineeringServices' - id: 'a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1' - } - { - name: 'Helix' - id: '68672ab8-de0c-40f1-8d1b-ffb20bd62c0f' - } -] - -// Select subscription list based on environment -var monitoringSubscriptions = environment == 'Production' ? 
productionSubscriptions : stagingSubscriptions - resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) scope: grafanaKeyVault @@ -230,18 +180,6 @@ resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { } } -// Grant Monitoring Reader role to Grafana managed identity on multiple subscriptions -// This allows Azure Monitor datasources to query metrics and logs -module grafanaMonitoringReaderRoles 'grafana-monitoring-reader.bicep' = [for sub in monitoringSubscriptions: { - name: 'monitoringReader-${sub.name}-${environment}' - scope: subscription(sub.id) - params: { - grafanaPrincipalId: grafanaUserAssignedIdentity.properties.principalId - environment: environment - identityResourceId: grafanaUserAssignedIdentity.id - } -}] - // Grant Grafana Admin role to .NET Engineering Services group resource dotnetEngServicesGrafanaAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { name: guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) diff --git a/eng/deployment/grafana-monitoring-reader.bicep b/eng/deployment/grafana-monitoring-reader.bicep deleted file mode 100644 index e6a81befc..000000000 --- a/eng/deployment/grafana-monitoring-reader.bicep +++ /dev/null @@ -1,29 +0,0 @@ -// Module to grant Monitoring Reader role at subscription scope -// This must be deployed at subscription scope, so it's a separate module - -targetScope = 'subscription' - -@description('The principal ID of the Grafana managed identity') -param grafanaPrincipalId string - -@description('The deployment environment (for unique naming)') -param environment string - -@description('The managed identity resource ID (for unique naming)') -param identityResourceId string - -// Monitoring Reader role ID -var monitoringReaderRoleId = '43d0d8ad-25c7-4714-9337-8ba259a9fe05' - -// Grant Monitoring Reader role to Grafana managed 
identity -resource monitoringReaderRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(subscription().id, identityResourceId, monitoringReaderRoleId, environment) - properties: { - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', monitoringReaderRoleId) - principalId: grafanaPrincipalId - principalType: 'ServicePrincipal' - description: 'Grants Grafana managed identity read access to Azure Monitor resources for ${environment} environment' - } -} - -output roleAssignmentId string = monitoringReaderRole.id diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 1d6f63197..2862a0bc9 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -174,6 +174,115 @@ jobs: Write-Host "ā„¹ļø Note: Azure RBAC permissions can take 5-10 minutes to propagate to Key Vault data plane" Write-Host "ā„¹ļø The PublishDashboards stage has retry logic to handle propagation delays" + - task: AzureCLI@2 + displayName: 'Grant Grafana Identity Monitoring Reader Access' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "==========================================" + Write-Host "Granting Monitoring Reader to Grafana Identity" + Write-Host "==========================================" + Write-Host "" + + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $environment = "${{ parameters.DeploymentEnvironment }}" + + # Get the user-assigned managed identity (name matches Bicep template) + $managedIdentityName = if ($environment -eq 'Production') { 'dnceng-managed-grafana' } else { 'dnceng-managed-grafana-staging' } + Write-Host "Retrieving managed identity: $managedIdentityName" + + $identity = az identity show --name $managedIdentityName --resource-group $rgName --query '{principalId:principalId, clientId:clientId}' --output json | 
ConvertFrom-Json + + if (-not $identity) { + Write-Error "Failed to retrieve managed identity: $managedIdentityName" + exit 1 + } + + $principalId = $identity.principalId + $clientId = $identity.clientId + + Write-Host "āœ“ Managed Identity: $managedIdentityName" + Write-Host "āœ“ Principal ID: $principalId" + Write-Host "āœ“ Client ID: $clientId" + Write-Host "" + + # Define common subscriptions shared across all environments + $commonSubscriptions = @( + @{name="HelixStaging"; id="cab65fc3-d077-467d-931f-3932eabf36d3"}, + @{name="dnceng-internaltooling"; id="84a65c9a-787d-45da-b10a-3a1cefce8060"}, + @{name="Dotnet Engineering services"; id="a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1"}, + @{name="Helix"; id="68672ab8-de0c-40f1-8d1b-ffb20bd62c0f"} + ) + + # Add environment-specific subscriptions + $subscriptions = @() + if ($environment -eq "Staging") { + $subscriptions += @{name=".NET Product Construction Services - Staging"; id="e6b5f9f5-0ca4-4351-879b-014d78400ec2"} + } else { + $subscriptions += @{name=".NET Product Construction Services"; id="fbd6122a-9ad3-42e4-976e-bccb82486856"} + } + $subscriptions += $commonSubscriptions + + Write-Host "Granting Monitoring Reader role on $($subscriptions.Count) subscriptions..." + Write-Host "" + + $monitoringReaderRoleId = "43d0d8ad-25c7-4714-9337-8ba259a9fe05" + $successCount = 0 + $failCount = 0 + + foreach ($sub in $subscriptions) { + Write-Host "Processing: $($sub.name) ($($sub.id))" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $principalId ` + --role "Monitoring Reader" ` + --scope "/subscriptions/$($sub.id)" ` + --query "[0].id" ` + --output tsv 2>$null + + if ($existingAssignment) { + Write-Host " āœ“ Role assignment already exists" + $successCount++ + } else { + Write-Host " Creating role assignment..." 
+ + $result = az role assignment create ` + --role "Monitoring Reader" ` + --assignee-object-id $principalId ` + --assignee-principal-type ServicePrincipal ` + --scope "/subscriptions/$($sub.id)" ` + --output none 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host " āœ“ Role assignment created successfully" + $successCount++ + } else { + Write-Warning " ⚠ Failed to create role assignment" + Write-Warning " Error: $result" + $failCount++ + } + } + Write-Host "" + } + + Write-Host "==========================================" + Write-Host "Role Assignment Summary" + Write-Host "==========================================" + Write-Host "āœ“ Successful: $successCount / $($subscriptions.Count)" + if ($failCount -gt 0) { + Write-Host "⚠ Failed: $failCount / $($subscriptions.Count)" + Write-Host "" + Write-Host "Note: Some failures may be due to lack of permissions on target subscriptions." + Write-Host "These role assignments may need to be granted manually by subscription owners." + } + Write-Host "" + Write-Host "ā„¹ļø RBAC propagation can take 2-5 minutes for Azure Monitor queries to work" + Write-Host "" + - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: From 5d05dbe383b31d286bae09604a7579b2525cfb7d Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 12 Nov 2025 16:32:29 -0800 Subject: [PATCH 084/133] fix grafana notification contact points --- ...k.imageless.12h_reminder.notification.json | 11 ++--- .../statusHook.imageless.notification.json | 11 ++--- .../Production/statusHook.notification.json | 11 ++--- .../Production/teamsHook.notification.json | 10 ++--- ...k.imageless.12h_reminder.notification.json | 11 ++--- .../statusHook.imageless.notification.json | 11 ++--- .../Staging/statusHook.notification.json | 11 ++--- .../Staging/teamsHook.notification.json | 10 ++--- src/Monitoring/Sdk/DeployPublisher.cs | 35 +++++++++++++-- src/Monitoring/Sdk/GrafanaClient.cs | 45 +++++++++++++++++++ 10 files changed, 100 insertions(+), 66 
deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json index 8c6d7d3e6..cd4031247 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image, 12h reminder)", "type": "webhook", - "isDefault": false, - "sendReminder": true, "disableResolveMessage": false, - "frequency": "12h", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json index a337a1271..5fefae928 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image)", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status.azurewebsites.net/api/alert", - 
"username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json index 34e223ac2..5d1c2d143 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json index f65218b6c..b8f49b778 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json @@ -1,14 +1,10 @@ { "name": "Teams Alert", "type": "teams", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "uploadImage": false, - "url": "[vault(fr-bot-notifications-teams-notification-url)]" + "url": "[vault(fr-bot-notifications-teams-notification-url)]", + "message": "{{ template 
\"teams.default.message\" . }}", + "title": "{{ template \"teams.default.title\" . }}" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json index 09aeb8713..875a5780c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image, 12h reminder)", "type": "webhook", - "isDefault": false, - "sendReminder": true, "disableResolveMessage": false, - "frequency": "12h", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json index e677ea1ab..3ebd2831b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image)", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": 
"[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json index 61b1b74ce..895db63b7 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json index f65218b6c..b8f49b778 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json @@ -1,14 +1,10 @@ { "name": "Teams Alert", "type": "teams", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "uploadImage": false, - "url": 
"[vault(fr-bot-notifications-teams-notification-url)]" + "url": "[vault(fr-bot-notifications-teams-notification-url)]", + "message": "{{ template \"teams.default.message\" . }}", + "title": "{{ template \"teams.default.title\" . }}" } } \ No newline at end of file diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index 5a7e51583..ed0fb7cc1 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -62,10 +62,8 @@ public async Task PostToGrafanaAsync() { await PostDatasourcesAsync().ConfigureAwait(false); - // Skip notifications for Azure Managed Grafana - it uses Grafana Unified Alerting - // which has different APIs (/api/v1/provisioning/contact-points) - // TODO: Implement unified alerting support - // await PostNotificationsAsync().ConfigureAwait(false); + // Post contact points for unified alerting (Azure Managed Grafana) + await PostContactPointsAsync().ConfigureAwait(false); await PostDashboardsAsync().ConfigureAwait(false); } @@ -118,6 +116,35 @@ private async Task PostNotificationsAsync() } } + private async Task PostContactPointsAsync() + { + // Check if notification directory exists (optional feature) + if (!Directory.Exists(EnvironmentNotificationDirectory)) + { + Log.LogMessage(MessageImportance.Low, "No notification directory found at {0}, skipping contact points", EnvironmentNotificationDirectory); + return; + } + + foreach (string notificationPath in Directory.GetFiles(EnvironmentNotificationDirectory, + "*" + NotificationExtension, + SearchOption.AllDirectories)) + { + JObject data; + using (var sr = new StreamReader(notificationPath)) + using (var jr = new JsonTextReader(sr)) + { + data = await JObject.LoadAsync(jr).ConfigureAwait(false); + } + + string name = data.Value("name"); + Log.LogMessage(MessageImportance.Normal, "Posting contact point {0}...", name); + + await ReplaceVaultAsync(data); + + await GrafanaClient.CreateContactPointAsync(data).ConfigureAwait(false); 
+ } + } + private async Task PostDashboardsAsync() { JArray folderArray = await GrafanaClient.ListFoldersAsync().ConfigureAwait(false); diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index 16103b192..0edab3df3 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -185,6 +185,31 @@ public Task CreateNotificationChannelAsync(JObject notificationChannel) ); } + public async Task CreateContactPointAsync(JObject contactPoint) + { + string name = contactPoint.Value("name"); + + // Check if contact point already exists by name + JObject existing = await GetContactPointAsync(name).ConfigureAwait(false); + + if (existing != null) + { + // Update existing contact point using PUT + var uri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(name)}"); + + // Preserve the existing uid + contactPoint["uid"] = existing.Value("uid"); + + await SendObjectAsync(contactPoint, uri, HttpMethod.Put).ConfigureAwait(false); + } + else + { + // Create new contact point using POST + var uri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); + await SendObjectAsync(contactPoint, uri, HttpMethod.Post).ConfigureAwait(false); + } + } + private async Task CreateOrUpdateAsync( JObject data, TExternalId id, @@ -319,6 +344,26 @@ public async Task GetNotificationChannelAsync(string uid) } } + public async Task GetContactPointAsync(string name) + { + var uri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(name)}"); + + using (HttpResponseMessage response = await _client.GetAsync(uri).ConfigureAwait(false)) + { + if (response.StatusCode == HttpStatusCode.NotFound) + return null; + + await response.EnsureSuccessWithContentAsync(); + + using (Stream stream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new 
JsonTextReader(streamReader)) + { + return await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + } + } + } + public void Dispose() { _client?.Dispose(); From b30cf22c46927de32f237cbdbd715b1122f353fb Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 18 Nov 2025 10:15:51 -0800 Subject: [PATCH 085/133] azure managed grafana alert rules --- ...otneteng-status-failed-requests.alert.json | 140 ++++++++++++ .../helix-api-availability.alert.json | 162 ++++++++++++++ ...helix-api-average-response-time.alert.json | 151 +++++++++++++ ...elix-autoscaler-service-stopped.alert.json | 159 ++++++++++++++ .../pcs-background-worker-stopped.alert.json | 186 ++++++++++++++++ ...ontainer-job-execution-failures.alert.json | 140 ++++++++++++ .../pcs-disk-space-issues.alert.json | 140 ++++++++++++ .../Production/pcs-exceptions-high.alert.json | 139 ++++++++++++ .../pcs-git-push-success-rate.alert.json | 140 ++++++++++++ .../pcs-work-item-success-rate.alert.json | 201 ++++++++++++++++++ .../source-dot-net-availability.alert.json | 156 ++++++++++++++ .../alertrules/README.md | 199 +++++++++++++++++ ...otneteng-status-failed-requests.alert.json | 140 ++++++++++++ .../Staging/helix-api-availability.alert.json | 162 ++++++++++++++ ...helix-api-average-response-time.alert.json | 151 +++++++++++++ ...elix-autoscaler-service-stopped.alert.json | 159 ++++++++++++++ .../pcs-background-worker-stopped.alert.json | 186 ++++++++++++++++ ...ontainer-job-execution-failures.alert.json | 140 ++++++++++++ .../Staging/pcs-disk-space-issues.alert.json | 140 ++++++++++++ .../Staging/pcs-exceptions-high.alert.json | 139 ++++++++++++ .../pcs-git-push-success-rate.alert.json | 140 ++++++++++++ .../pcs-work-item-success-rate.alert.json | 194 +++++++++++++++++ .../source-dot-net-availability.alert.json | 156 ++++++++++++++ src/Monitoring/Sdk/DeployPublisher.cs | 60 ++++++ src/Monitoring/Sdk/DeployToolBase.cs | 6 + src/Monitoring/Sdk/GrafanaClient.cs | 40 ++++ 26 files changed, 3726 insertions(+) create 
mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json create mode 100644 
src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json new file mode 100644 index 000000000..f0d19fefe --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "dotneteng-status-failed-requests", + "title": "DotNetEng Status Failed Requests/Hour alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "let r = requests | where $__timeFilter(timestamp);\nlet f = coalesce(toscalar(r | summarize min(timestamp)), ago(1d));\nlet t = coalesce(toscalar(r | summarize max(timestamp)), now());\nlet span=(t - f)/60;\nlet interval=case(span >= 1d, bin(span, 1d), span >= 1h, bin(span, 1h), 15m);\nlet intervalHours = interval / 1h;\nr\n| where success == false\n| make-series kind=nonempty valueCount=count() default=0 on timestamp in range(f, t, interval)\n| mv-expand timestamp to 
typeof(datetime), valueCount to typeof(double)\n| project timestamp, failuresCount=valueCount/intervalHours", + "resources": [ + "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourceGroups/monitoring/providers/microsoft.insights/components/DotNetEng-Status-Prod" + ], + "resultFormat": "time_series", + "workspace": "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "eastus", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "eastus", + "resourceGroup": "monitoring", + "resourceName": "DotNetEng-Status-Prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1h", + "now-10m" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [20], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": 
["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "The number of failed DotNetEng Status requests per hour is above 20. This may indicate a systemic problem that needs to be investigated.\\nTo initially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" + }, + "labels": { + "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" + }, + "folderUID": "arcade-services", + "ruleGroup": "DotNetEng Status Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json new file mode 100644 index 000000000..991b04e21 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json @@ -0,0 +1,162 @@ +{ + "uid": "helix-api-availability", + "title": "Helix API availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "{{ availabilityresult/location }}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", +
"dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "Helix API", + "operator": "eq" + }, + { + "dimension": "availabilityResult/location", + "filter": "*", + "operator": "eq" + } + ], + "metricDefinition": "microsoft.insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "helixinfrarg", + "resourceName": "helix-prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "30m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 1800, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 99 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix API availability alert!" 
+ }, + "labels": { + "NotificationId": "6179576701874a7abc440a574cf636d0" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json new file mode 100644 index 000000000..dde9851b2 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json @@ -0,0 +1,151 @@ +{ + "uid": "helix-api-average-response-time", + "title": "Helix API Average Response Time", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "requests/duration", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + 
"refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 5000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix API Average Response Time is high!" 
+ }, + "labels": { + "NotificationId": "24cae10d9eca44079e7cf3d47f148497" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json new file mode 100644 index 000000000..52876deb1 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json @@ -0,0 +1,159 @@ +{ + "uid": "helix-autoscaler-service-stopped", + "title": "Helix AutoScaler Service Stopped Running", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Count", + "alias": "{{cloud/RoleName}}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "cloud/roleName", + "filter": "fabric:/Helix/AutoScaleActorService", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "traces/count", + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "100" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", 
+ "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "30m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." 
+ }, + "labels": { + "NotificationId": "6213d3c5ce9a46278343bf075798e46f" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json new file mode 100644 index 000000000..616f1afc6 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json @@ -0,0 +1,186 @@ +{ + "uid": "pcs-background-worker-stopped", + "title": "PCS Background Worker Stopped", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Type=tostring(customDimensions[\"WorkItemType\"])\r\n| summarize Count=count() by bin(timestamp, $__interval), Type=replace_string(Type, \"WorkItem\", \"\")\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": 
"[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\"\r\n| summarize TotalCount=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + 
"operator": { + "type": "and" + }, + "query": { + "params": [ + "B", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [20], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["C"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "Alerting", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "23909d48866646408f669cc1c3d325ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json new file mode 100644 index 000000000..e3bf54bd1 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-container-job-execution-failures", + "title": "Container job execution failures alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log 
Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "ContainerAppSystemLogs_CL\r\n| where TimeGenerated > ago(14d)\r\n| where Log_s has_any (\"has exited with status Succeeded\", \"has exited with status Failed\")\r\n| summarize arg_max(TimeGenerated, Log_s) by JobName_s\r\n| where Log_s has \"has exited with status Failed\"\r\n| project TimeGenerated,JobName=JobName_s, FailedJob=1", + "resources": [ + "[parameter(product-construction-service-workspace-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resources": [ + { + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "max", + "refId": "B", + "type": "reduce" + }, + 
"relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\\n\\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" + }, + "labels": { + "NotificationId": "0a5c68b0daf846ef83a66c6c70fd24ad" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert (no image, 12h reminder)", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "12h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json new file mode 100644 index 000000000..04020ce81 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-disk-space-issues", + "title": "PCS Disk Space Issues alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where timestamp > now()-6h and (outerMessage contains \"No 
space left on device\" or innermostMessage contains \"No space left on device\")\r\n| summarize TotalCount=count() by bin(timestamp, 1h)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "count", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + 
"refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json new file mode 100644 index 000000000..7e3902030 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json @@ -0,0 +1,139 @@ +{ + "uid": "pcs-exceptions-high", + "title": "PCS Exceptions High", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where $__timeFilter(timestamp) and problemId !contains \"SpaDefaultPageMiddleware\"\r\n| summarize Exceptions=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + 
"metricNamespace": "microsoft.insights/components", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [15], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "08f669cc1c3d325ee488666464" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + 
"intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json new file mode 100644 index 000000000..457990212 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-git-push-success-rate", + "title": "Git Push success rate alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"GitPush\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": 
"[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [80], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\\n\\nPCS has a high `git push` failure rate, please investigate\\n\\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json new file mode 100644 index 000000000..1538aafdb --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json @@ -0,0 +1,201 @@ +{ + "uid": "pcs-work-item-success-rate", + "title": "PCS Work Item Success Rate alert", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize Successful = countif(Success == true), Failed = countif(Success == false) by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ], + "intervalMs": 300000 + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + 
"datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ], + "intervalMs": 300000 + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce", + "intervalMs": 300000 + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "D", + 
"queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 74 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold", + "intervalMs": 300000 + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "annotations": { + "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\\n\\nThe PCS background work items started to fail frequently.\\n\\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "d71fe025a8954b6cad9866354ca041ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json new file mode 100644 index 000000000..087546715 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json @@ -0,0 +1,156 @@ +{ + "uid": "source-dot-net-availability", + "title": "source.dot.net Availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, 
+ 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "source-dot-net", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 60 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "15m", + "frequency": "1m", + "annotations": { + "description": "source.dot.net availability is low!" 
+ }, + "labels": { + "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" + }, + "folderUID": "arcade-services", + "ruleGroup": "Source Browser Alerts", + "intervalSeconds": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md b/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md new file mode 100644 index 000000000..86cc3ffbb --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md @@ -0,0 +1,199 @@ +# Alert Migration Status + +## ✅ Completed + +### SDK Implementation +- ✅ Added `CreateAlertRuleAsync()` to GrafanaClient.cs +- ✅ Added `PostAlertRulesAsync()` to DeployPublisher.cs +- ✅ Integrated alert rule provisioning into PublishGrafana pipeline +- ✅ Created alertrules directory structure + +### Alert Rules Created +1. ✅ `pcs-work-item-success-rate.alert.json` - Monitors PCS work item success rate, alerts when < 74% +2. ✅ `pcs-exceptions-high.alert.json` - Monitors exception count, alerts when > 15 exceptions + +## 📋 Remaining Alerts to Convert + +### From arcadeAvailability.dashboard.json +3. ⏳ PCS Background Worker Stopped - Alerts when work item processing stops (< 20 items) +4. ⏳ PCS Disk Space Issues alert - Monitors disk space availability +5. ⏳ Git Push success rate alert - Tracks git operation success +6. ⏳ Container job execution failures alert - Azure DevOps pipeline failures +7. ⏳ Helix API availability - API health check +8. ⏳ Helix API Average Response Time - Performance monitoring +9. ⏳ Helix AutoScaler Service Stopped Running - Service health +10. ⏳ DotNetEng Status Failed Requests/Hour alert - HTTP error tracking +11. ⏳ source.dot.net Availability - Website uptime + +### From quota.dashboard.json +12. ⏳ Alert 1 (TBD - need to extract) +13. 
⏳ Alert 2 (TBD - need to extract) +14. ⏳ Alert 3 (TBD - need to extract) +15. ⏳ Alert 4 (TBD - need to extract) + +## 🔄 Alert Migration Process + +Each alert requires: + +1. **Extract from dashboard JSON** + - Find the panel with `"alert": {}` block + - Extract `alert.name`, `alert.message`, `alert.conditions`, `alert.notifications` + - Extract `targets` array (queries) + +2. **Convert to unified alerting format** + - Create new `.alert.json` file with kebab-case uid + - Convert queries to `data` array + - Add reduce expression (refId: B) - extracts last value from time series + - Add threshold expression (refId: C) - applies condition + - Map state: `keep_state` → `KeepLast`, `ok` → `OK`, `alerting` → `Alerting` + - Convert `for` duration (e.g., "5m") + - Convert `frequency` to `intervalSeconds` (e.g., "1m" → 60) + - Move `alertRuleTags` to `labels` + - Move `message` to `annotations.description` + - Reference `folderUID`: "arcade-services" + +3. **Handle notifications** + - Legacy: `"notifications": [{"uid": "statusHook"}]` + - Unified: Grafana automatically routes based on notification policy + - Contact points already created: "statusHook", "Teams Alert", etc. + +4. **Create for both environments** + - Copy to `alertrules/Staging/` + - Copy to `alertrules/Production/` + - Parameters auto-replaced during deployment + +5. 
**Remove from dashboard** + - Delete entire `"alert": {}` block from panel + - Keep `thresholds` array for visual indicators + +## 🎯 Example Alert Structure + +```json +{ + "uid": "alert-name-kebab-case", + "title": "Alert Display Name", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "azureLogAnalytics": { + "query": "KQL query here", + "resource": "[parameter(...)]" + }, + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "expression": "A", + "reducer": "last", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "expression": "B", + "type": "threshold", + "conditions": [{ + "evaluator": {"params": [threshold], "type": "lt|gt"}, + "type": "query" + }] + } + } + ], + "noDataState": "KeepLast|OK|NoData|Alerting", + "execErrState": "KeepLast|Alerting", + "for": "5m", + "annotations": { + "description": "Alert message with @mentions" + }, + "labels": { + "NotificationId": "unique-id" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false +} +``` + +## 🚀 Testing Alert Rules + +After provisioning: + +1. **Verify in Grafana UI**: + ``` + Navigate to: Alerting → Alert rules + Expected: See "PCS Work Item Success Rate alert", "PCS Exceptions High" + ``` + +2. **Check alert evaluation**: + ``` + Each alert should show: + - State: OK / Firing / Pending / NoData + - Last evaluation time + - Next evaluation time + ``` + +3. **Test notifications**: + ``` + - Wait for alert to fire naturally, OR + - Temporarily lower threshold to trigger alert + - Verify notification sent to contact point + ``` + +4. 
**View alert history**: + ``` + Navigate to: Alerting → Alert instances + See firing history and state changes + ``` + +## 📝 Notes + +- Contact points (statusHook, Teams Alert) already created and working +- Notification routing happens automatically via notification policies +- Alert rules are independent of dashboards +- Can have multiple alerts on same query +- Supports complex multi-condition logic via expression queries + +## ⚠️ Current State + +**IMPORTANT**: Only 2 of 15+ alerts have been migrated so far. The remaining alerts need to be converted following the same pattern as the two examples. + +The SDK is ready - it will automatically pick up any new `.alert.json` files added to the `alertrules/Staging/` or `alertrules/Production/` directories. + +## 🔧 Quick Reference + +**Convert frequency to seconds**: +- "1m" → 60 +- "5m" → 300 +- "1h" → 3600 + +**State mapping**: +- `keep_state` → `KeepLast` +- `alerting` → `Alerting` +- `ok` → `OK` +- `no_data` → `NoData` + +**Condition operators**: +- `lt` = less than (<) +- `gt` = greater than (>) +- `within_range` = between two values +- `outside_range` = outside range + +**Reducer functions**: +- `last` = most recent value +- `avg` = average +- `min` = minimum +- `max` = maximum +- `sum` = sum diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json new file mode 100644 index 000000000..f0d19fefe --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "dotneteng-status-failed-requests", + "title": "DotNetEng Status Failed Requests/Hour alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "let r = 
requests | where $__timeFilter(timestamp);\nlet f = coalesce(toscalar(r | summarize min(timestamp)), ago(1d));\nlet t = coalesce(toscalar(r | summarize max(timestamp)), now());\nlet span=(t - f)/60;\nlet interval=case(span >= 1d, bin(span, 1d), span >= 1h, bin(span, 1h), 15m);\nlet intervalHours = interval / 1h;\nr\n| where success == false\n| make-series kind=nonempty valueCount=count() default=0 on timestamp in range(f, t, interval)\n| mv-expand timestamp to typeof(datetime), valueCount to typeof(double)\n| project timestamp, failuresCount=valueCount/intervalHours", + "resources": [ + "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourceGroups/monitoring/providers/microsoft.insights/components/DotNetEng-Status-Prod" + ], + "resultFormat": "time_series", + "workspace": "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "eastus", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "eastus", + "resourceGroup": "monitoring", + "resourceName": "DotNetEng-Status-Prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1h", + 
"now-10m" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [20], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "The number of failed DotNetEng Status requests per hour is above 20. This may indicate a systemic problem that needs to be investigated.\\nTo initially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" + }, + "labels": { + "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" + }, + "folderUID": "arcade-services", + "ruleGroup": "DotNetEng Status Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json new file mode 100644 index 000000000..991b04e21 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json @@ -0,0 +1,162 @@ +{ + "uid": "helix-api-availability", + "title": "Helix 
API availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "{{ availabilityresult/location }}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "Helix API", + "operator": "eq" + }, + { + "dimension": "availabilityResult/location", + "filter": "*", + "operator": "eq" + } + ], + "metricDefinition": "microsoft.insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "helixinfrarg", + "resourceName": "helix-prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "30m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 1800, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + 
"params": [ + 99 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix API availability alert!" + }, + "labels": { + "NotificationId": "6179576701874a7abc440a574cf636d0" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json new file mode 100644 index 000000000..dde9851b2 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json @@ -0,0 +1,151 @@ +{ + "uid": "helix-api-average-response-time", + "title": "Helix API Average Response Time", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "requests/duration", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": 
"dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 5000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix API Average Response Time is high!" 
+ }, + "labels": { + "NotificationId": "24cae10d9eca44079e7cf3d47f148497" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json new file mode 100644 index 000000000..52876deb1 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json @@ -0,0 +1,159 @@ +{ + "uid": "helix-autoscaler-service-stopped", + "title": "Helix AutoScaler Service Stopped Running", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Count", + "alias": "{{cloud/RoleName}}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "cloud/roleName", + "filter": "fabric:/Helix/AutoScaleActorService", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "traces/count", + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "100" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + 
"subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "30m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." 
+ }, + "labels": { + "NotificationId": "6213d3c5ce9a46278343bf075798e46f" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json new file mode 100644 index 000000000..616f1afc6 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json @@ -0,0 +1,186 @@ +{ + "uid": "pcs-background-worker-stopped", + "title": "PCS Background Worker Stopped", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Type=tostring(customDimensions[\"WorkItemType\"])\r\n| summarize Count=count() by bin(timestamp, $__interval), Type=replace_string(Type, \"WorkItem\", \"\")\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + 
} + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\"\r\n| summarize TotalCount=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B", + "5m", 
+ "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [20], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["C"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "Alerting", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "23909d48866646408f669cc1c3d325ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json new file mode 100644 index 000000000..e3bf54bd1 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-container-job-execution-failures", + "title": "Container job execution failures alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, 
+ "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "ContainerAppSystemLogs_CL\r\n| where TimeGenerated > ago(14d)\r\n| where Log_s has_any (\"has exited with status Succeeded\", \"has exited with status Failed\")\r\n| summarize arg_max(TimeGenerated, Log_s) by JobName_s\r\n| where Log_s has \"has exited with status Failed\"\r\n| project TimeGenerated,JobName=JobName_s, FailedJob=1", + "resources": [ + "[parameter(product-construction-service-workspace-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resources": [ + { + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "max", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", 
+ "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\\n\\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" + }, + "labels": { + "NotificationId": "0a5c68b0daf846ef83a66c6c70fd24ad" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert (no image, 12h reminder)", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "12h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json new file mode 100644 index 000000000..04020ce81 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-disk-space-issues", + "title": "PCS Disk Space Issues alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where timestamp > now()-6h and (outerMessage contains \"No space left on device\" or innermostMessage contains \"No space left on device\")\r\n| summarize 
TotalCount=count() by bin(timestamp, 1h)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "count", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": 
"KeepLast", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json new file mode 100644 index 000000000..7e3902030 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json @@ -0,0 +1,139 @@ +{ + "uid": "pcs-exceptions-high", + "title": "PCS Exceptions High", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where $__timeFilter(timestamp) and problemId !contains \"SpaDefaultPageMiddleware\"\r\n| summarize Exceptions=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "product-construction-service", + 
"resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [15], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "08f669cc1c3d325ee488666464" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status 
Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json new file mode 100644 index 000000000..457990212 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json @@ -0,0 +1,140 @@ +{ + "uid": "pcs-git-push-success-rate", + "title": "Git Push success rate alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"GitPush\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + 
"e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [80], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\n\nPCS has a high `git push` failure rate, please investigate\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json new file mode 100644 index 000000000..acba7015a --- /dev/null +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json @@ -0,0 +1,194 @@ +{ + "uid": "pcs-work-item-success-rate", + "title": "PCS Work Item Success Rate alert", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize Successful = countif(Success == true), Failed = countif(Success == false) by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = 
tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "hide": false, + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "product-construction-service", + "resourceName": "product-construction-service-ai-int", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 74 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + 
"type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "d71fe025a8954b6cad9866354ca041ee" + }, + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json new file mode 100644 index 000000000..087546715 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json @@ -0,0 +1,156 @@ +{ + "uid": "source-dot-net-availability", + "title": "source.dot.net Availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "source-dot-net", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": 
"microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 60 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "15m", + "frequency": "1m", + "annotations": { + "description": "source.dot.net availability is low!" 
+ }, + "labels": { + "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" + }, + "folderUID": "arcade-services", + "ruleGroup": "Source Browser Alerts", + "intervalSeconds": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index ed0fb7cc1..a144da730 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -52,6 +52,7 @@ public DeployPublisher( private string EnvironmentDatasourceDirectory => Path.Combine(DatasourceDirectory, _environment); private string EnvironmentNotificationDirectory => Path.Combine(NotificationDirectory, _environment); + private string AlertRuleDirectory => Path.Combine(Path.GetDirectoryName(NotificationDirectory), "alertrules", _environment); public void Dispose() { @@ -65,6 +66,9 @@ public async Task PostToGrafanaAsync() // Post contact points for unified alerting (Azure Managed Grafana) await PostContactPointsAsync().ConfigureAwait(false); + // Post alert rules for unified alerting (Azure Managed Grafana) + await PostAlertRulesAsync().ConfigureAwait(false); + await PostDashboardsAsync().ConfigureAwait(false); } @@ -145,6 +149,62 @@ private async Task PostContactPointsAsync() } } + private async Task PostAlertRulesAsync() + { + // Check if alert rules directory exists (optional feature) + if (!Directory.Exists(AlertRuleDirectory)) + { + Log.LogMessage(MessageImportance.Low, "No alert rules directory found at {0}, skipping alert rules", AlertRuleDirectory); + return; + } + + Log.LogMessage(MessageImportance.High, "Loading parameters from: {0}", Path.GetFullPath(_parameterFile)); + Log.LogMessage(MessageImportance.High, "Parameters file exists: {0}", File.Exists(_parameterFile)); + + // Load parameters for deparameterization + List parameters; + using (StreamReader sr = new 
StreamReader(_parameterFile)) + using (JsonReader jr = new JsonTextReader(sr)) + { + JsonSerializer jsonSerializer = new JsonSerializer(); + parameters = jsonSerializer.Deserialize>(jr); + } + + if (parameters == null || parameters.Count == 0) + { + Log.LogError("Failed to load parameters from {0}", _parameterFile); + return; + } + + Log.LogMessage(MessageImportance.High, "Loaded {0} parameters from {1}", parameters.Count, _parameterFile); + + foreach (string alertRulePath in Directory.GetFiles(AlertRuleDirectory, + "*" + AlertRuleExtension, + SearchOption.AllDirectories)) + { + JObject data; + using (var sr = new StreamReader(alertRulePath)) + using (var jr = new JsonTextReader(sr)) + { + data = await JObject.LoadAsync(jr).ConfigureAwait(false); + } + + string uid = data.Value("uid"); + string title = data.Value("title"); + Log.LogMessage(MessageImportance.Normal, "Posting alert rule {0} ({1})...", uid, title); + + // Replace [parameter(...)] placeholders with environment-specific values + data = GrafanaSerialization.DeparameterizeDashboard(data, parameters, _environment); + + // Log the final JSON for debugging + Log.LogMessage(MessageImportance.High, "Alert JSON after parameter replacement: {0}", data.ToString(Formatting.Indented)); + + await ReplaceVaultAsync(data); + + await GrafanaClient.CreateAlertRuleAsync(data).ConfigureAwait(false); + } + } + private async Task PostDashboardsAsync() { JArray folderArray = await GrafanaClient.ListFoldersAsync().ConfigureAwait(false); diff --git a/src/Monitoring/Sdk/DeployToolBase.cs b/src/Monitoring/Sdk/DeployToolBase.cs index fa525f9ad..7ad56be66 100644 --- a/src/Monitoring/Sdk/DeployToolBase.cs +++ b/src/Monitoring/Sdk/DeployToolBase.cs @@ -12,6 +12,7 @@ public abstract class DeployToolBase protected const string DashboardExtension = ".dashboard.json"; protected const string DatasourceExtension = ".datasource.json"; protected const string NotificationExtension = ".notification.json"; + protected const string 
AlertRuleExtension = ".alert.json"; protected const string BaseUidTagPrefix = "baseuid:"; protected const string SourceTagPrefix = "source:"; @@ -82,4 +83,9 @@ protected static string GetNameFromDatasourceFile(string fileName) { return fileName.Substring(0, fileName.Length - DatasourceExtension.Length); } + + protected static string GetUidFromAlertRuleFile(string fileName) + { + return fileName.Substring(0, fileName.Length - AlertRuleExtension.Length); + } } diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index 0edab3df3..eea821e7e 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -364,6 +364,46 @@ public async Task GetContactPointAsync(string name) } } + public async Task CreateAlertRuleAsync(JObject alertRule) + { + string uid = alertRule.Value("uid"); + + // Check if alert rule already exists + var getUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/alert-rules/{Uri.EscapeDataString(uid)}"); + + using (HttpResponseMessage existCheck = await _client.GetAsync(getUri).ConfigureAwait(false)) + { + if (existCheck.StatusCode == HttpStatusCode.NotFound) + { + // Create new alert rule + var createUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/alert-rules"); + await SendObjectAsync(alertRule, createUri, HttpMethod.Post).ConfigureAwait(false); + } + else + { + // Update existing alert rule + await existCheck.EnsureSuccessWithContentAsync(); + + // Get existing version and provenance + using (Stream stream = await existCheck.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + JObject existing = await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + + // Preserve id and updated timestamp + if (existing.TryGetValue("id", out JToken idToken)) + alertRule["id"] = idToken; + if (existing.TryGetValue("updated", out JToken updatedToken)) + alertRule["updated"] 
= updatedToken; + } + + var updateUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/alert-rules/{Uri.EscapeDataString(uid)}"); + await SendObjectAsync(alertRule, updateUri, HttpMethod.Put).ConfigureAwait(false); + } + } + } + public void Dispose() { _client?.Dispose(); From 69a270927cbb5d0fd74b55bca6de61fdaa8042d9 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 18 Nov 2025 17:31:17 -0800 Subject: [PATCH 086/133] add quota alerts --- .../Production/cores-consumption.alert.json | 114 ++++++++++++++++ .../Production/quota-eastus.alert.json | 106 +++++++++++++++ .../Production/quota-westus.alert.json | 108 +++++++++++++++ .../Production/quota-westus2.alert.json | 108 +++++++++++++++ .../Staging/cores-consumption.alert.json | 114 ++++++++++++++++ .../Staging/quota-eastus.alert.json | 127 ++++++++++++++++++ .../Staging/quota-westus.alert.json | 108 +++++++++++++++ .../Staging/quota-westus2.alert.json | 108 +++++++++++++++ 8 files changed, 893 insertions(+) create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json new file mode 100644 index 000000000..cfb495d83 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -0,0 +1,114 @@ +{ + "uid": "cores-consumption", + "title": "Cores consumption", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 21600, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": 
"[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Cores consumption by Autoscaler is above 95% of limit" + }, + "labels": { + "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json new file mode 100644 index 000000000..383921ae5 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json @@ -0,0 +1,106 @@ +{ + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + 
"azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["C"] + }, + "type": "query" + } + ], + "datasource": { + "type": 
"__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region eastus!" + }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json new file mode 100644 index 000000000..1b9aab107 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json @@ -0,0 +1,108 @@ +{ + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + 
"resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region westus!" 
+ }, + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json new file mode 100644 index 000000000..08da0b4f5 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json @@ -0,0 +1,108 @@ +{ + "uid": "quota-westus2", + "title": "Azure quota usage for west us 2", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers' and resource != \"standardDASv4Family\"\n| where location == 'westus2'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + 
"top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region westus2!" 
+ }, + "labels": { + "NotificationId": "44aff3c937c042caa09f821ae923c26c" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json new file mode 100644 index 000000000..cfb495d83 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json @@ -0,0 +1,114 @@ +{ + "uid": "cores-consumption", + "title": "Cores consumption", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 21600, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", + 
"resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "Cores consumption by Autoscaler is above 95% of limit" + }, + "labels": { + "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": [ + "alertname" + ], + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json new file mode 100644 index 000000000..752a9644f --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json @@ -0,0 +1,127 @@ +{ + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + 
"conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "30m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region eastus!" + }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json new file mode 100644 index 000000000..1b9aab107 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json @@ -0,0 +1,108 @@ +{ + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 
'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": 
"threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region westus!" + }, + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json new file mode 100644 index 000000000..08da0b4f5 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json @@ -0,0 +1,108 @@ +{ + "uid": "quota-westus2", + "title": "Azure quota usage for west us 2", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers' and resource != \"standardDASv4Family\"\n| where location == 'westus2'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": 
"[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "description": "An Azure Resource Quota is nearing its limit in region westus2!" 
+ }, + "labels": { + "NotificationId": "44aff3c937c042caa09f821ae923c26c" + }, + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_by": ["alertname"], + "group_wait": "5m", + "repeat_interval": "4h" + } +} From d09e4e6ebcfa7e3b552c5a8e9a7d9c28f72f39b4 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 10:01:02 -0800 Subject: [PATCH 087/133] import secret to staging and prod KV --- .vault-config/dnceng-amg-int-kv.yaml | 25 +++++-------------------- .vault-config/dnceng-amg-prod-kv.yaml | 11 +++++++++++ 2 files changed, 16 insertions(+), 20 deletions(-) create mode 100644 .vault-config/dnceng-amg-prod-kv.yaml diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml index 68a0c2697..34cbf9b3b 100644 --- a/.vault-config/dnceng-amg-int-kv.yaml +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -4,23 +4,8 @@ storageLocation: subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 name: dnceng-amg-int-kv -secrets: - # Copy only the secrets needed for Azure Managed Grafana datasources and notifications - - # API token for DotNet Status website - dotnet-build-bot-dotnet-eng-status-token: - type: text - parameters: - description: API token from https://dotneteng-status{-staging}.azurewebsites.net/ - Generated using dotnet-build-bot account - - # Authorization header for Deployment Annotations datasource - dotneteng-status-auth-header: - type: text - parameters: - description: "Bearer token for status API - Format: Bearer " - - # Teams webhook URL for alert notifications - fr-bot-notifications-teams-notification-url: - type: text - parameters: - description: Teams Incoming Webhook URL - Do not rotate +import: + - shared/dotnet-grafana-secrets.yaml: + - dotnet-build-bot-dotnet-eng-status-token + - dotneteng-status-auth-header + - fr-bot-notifications-teams-notification-url diff --git 
a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml new file mode 100644 index 000000000..34cbf9b3b --- /dev/null +++ b/.vault-config/dnceng-amg-prod-kv.yaml @@ -0,0 +1,11 @@ +storageLocation: + type: azure-key-vault + parameters: + subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 + name: dnceng-amg-prod-kv + +import: + - shared/dotnet-grafana-secrets.yaml: + - dotnet-build-bot-dotnet-eng-status-token + - dotneteng-status-auth-header + - fr-bot-notifications-teams-notification-url From 7b74036b6d4489c2309da8e3ca913ae52cd6f419 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 13:28:47 -0800 Subject: [PATCH 088/133] remove unused output variables --- eng/deployment/azure-appgw-grafana.bicep | 43 +++++++++++++++++++----- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/eng/deployment/azure-appgw-grafana.bicep b/eng/deployment/azure-appgw-grafana.bicep index 65816e1fe..32004caf2 100644 --- a/eng/deployment/azure-appgw-grafana.bicep +++ b/eng/deployment/azure-appgw-grafana.bicep @@ -13,14 +13,14 @@ param grafanaEndpoint string 'Standard_v2' 'WAF_v2' ]) -param skuName string = 'Standard_v2' +param skuName string = 'WAF_v2' @description('The SKU tier for Application Gateway') @allowed([ 'Standard_v2' 'WAF_v2' ]) -param skuTier string = 'Standard_v2' +param skuTier string = 'WAF_v2' @description('The capacity (instance count) for Application Gateway') @minValue(1) @@ -50,6 +50,7 @@ var customDomainName = '${publicDnsLabel}.${regionShortName}.cloudapp.azure.com' var appGwName = environment == 'Production' ? 'dnceng-grafana-appgw' : 'dnceng-grafana-staging-appgw' var publicIpName = environment == 'Production' ? 'dnceng-grafana-pip' : 'dnceng-grafana-staging-pip' var vnetName = environment == 'Production' ? 'dnceng-grafana-vnet' : 'dnceng-grafana-staging-vnet' +var wafPolicyName = environment == 'Production' ? 
'dnceng-grafana-waf-policy' : 'dnceng-grafana-staging-waf-policy' var subnetName = 'appgw-subnet' var backendPoolName = 'grafana-backend-pool' var frontendPortName = 'https-port' @@ -60,6 +61,34 @@ var ruleName = 'https-routing-rule' var probeName = 'grafana-health-probe' var sslCertificateName = 'appgw-ssl-cert' +// WAF Policy +resource wafPolicy 'Microsoft.Network/ApplicationGatewayWebApplicationFirewallPolicies@2023-05-01' = { + name: wafPolicyName + location: location + tags: resourceTags + properties: { + policySettings: { + requestBodyCheck: true + maxRequestBodySizeInKb: 128 + fileUploadLimitInMb: 100 + state: 'Enabled' + mode: 'Prevention' + } + managedRules: { + managedRuleSets: [ + { + ruleSetType: 'OWASP' + ruleSetVersion: '3.2' + } + { + ruleSetType: 'Microsoft_BotManagerRuleSet' + ruleSetVersion: '1.0' + } + ] + } + } +} + // Virtual Network for Application Gateway resource vnet 'Microsoft.Network/virtualNetworks@2023-05-01' = { name: vnetName @@ -120,6 +149,9 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = tier: skuTier capacity: capacity } + firewallPolicy: { + id: wafPolicy.id + } gatewayIPConfigurations: [ { name: 'appgw-ip-config' @@ -243,16 +275,9 @@ resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = } // Outputs -output applicationGatewayId string = applicationGateway.id output applicationGatewayName string = applicationGateway.name output applicationGatewayIdentity string = applicationGateway.identity.userAssignedIdentities[grafanaUserAssignedIdentityId].principalId output publicIpAddress string = publicIp.properties.ipAddress -output publicDnsLabel string = publicDnsLabel output customDomainName string = customDomainName output customDomainUrl string = 'https://${customDomainName}' -output vnetId string = vnet.id -output vnetName string = vnet.name - -// Usage instructions -output usageInstructions string = 'Access Grafana at: https://${customDomainName}' output accessUrl string 
= 'https://${customDomainName}' From 0fdc08602c3c0e4c6b1fa5ce6b206bb8894a7be0 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 13:42:27 -0800 Subject: [PATCH 089/133] remove unused output variables in azure managed grafana bicep --- eng/deployment/azure-managed-grafana.bicep | 8 -------- 1 file changed, 8 deletions(-) diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep index 55ca1ae32..f1cef2e65 100644 --- a/eng/deployment/azure-managed-grafana.bicep +++ b/eng/deployment/azure-managed-grafana.bicep @@ -192,16 +192,8 @@ resource dotnetEngServicesGrafanaAdminRole 'Microsoft.Authorization/roleAssignme } // Output the Grafana workspace details -output grafanaWorkspaceId string = grafanaWorkspace.id -output grafanaWorkspaceName string = grafanaWorkspace.name output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint -output grafanaPrincipalId string = grafanaUserAssignedIdentity.properties.principalId -output grafanaTenantId string = grafanaUserAssignedIdentity.properties.tenantId -output grafanaWorkspaceLocation string = grafanaWorkspace.location output grafanaUserAssignedIdentityId string = grafanaUserAssignedIdentity.id -output grafanaUserAssignedIdentityName string = grafanaUserAssignedIdentity.name // Output Key Vault details -output keyVaultId string = grafanaKeyVault.id output keyVaultName string = grafanaKeyVault.name -output keyVaultUri string = grafanaKeyVault.properties.vaultUri From e5048af478c934d2214749f34a34b105e10a3dbc Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 14:02:07 -0800 Subject: [PATCH 090/133] remove EnableCustomDomain variable from deploy-managed-grafana.yml --- eng/deploy-managed-grafana.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 44ef6e827..a96681fec 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml 
@@ -11,9 +11,6 @@ parameters: type: string - name: GrafanaKeyVault type: string -- name: EnableCustomDomain - type: boolean - default: true stages: - stage: ProvisionGrafana @@ -34,7 +31,6 @@ stages: - stage: ProvisionApplicationGateway displayName: 'Provision Application Gateway Custom Domain' dependsOn: ProvisionGrafana - condition: and(succeeded(), eq(${{ parameters.EnableCustomDomain }}, true)) variables: GrafanaIdentityId: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportIdentity.GrafanaIdentityId'] ] jobs: @@ -133,7 +129,6 @@ stages: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 variables: - # Allow scripts to access the System.AccessToken for Azure authentication - name: System.AccessToken value: $(System.AccessToken) steps: @@ -157,19 +152,17 @@ stages: Write-Host "Publishing Dashboards to Azure Managed Grafana" Write-Host "==========================================" Write-Host "Grafana Endpoint: $(GrafanaEndpoint)" - Write-Host "Key Vault: ${{ parameters.GrafanaKeyVault }}" Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" Write-Host "" # Get the API token from Key Vault with retry logic for RBAC propagation $tokenSecretName = "grafana-admin-api-key" Write-Host "Retrieving API token from Key Vault..." - Write-Host "ā„¹ļø Note: Azure RBAC can take 5-10 minutes to propagate to Key Vault data plane" $apiToken = $null - $maxRetries = 12 + $maxRetries = 5 $retryCount = 0 - $waitSeconds = 30 + $waitSeconds = 60 while (-not $apiToken -and $retryCount -lt $maxRetries) { try { @@ -197,9 +190,6 @@ stages: Write-Error "2. The SetupToken job failed to create the token" Write-Error "3. 
The pipeline service principal doesn't have Key Vault Secrets Officer role" Write-Error "" - Write-Error "Please check:" - Write-Error "- SetupToken job logs for errors" - Write-Error "- ProvisionGrafana stage logs for RBAC assignment status" exit 1 } } From edab753bfa9831afe1ec11d58e58f6ee0a3d2a1c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 17:36:18 -0800 Subject: [PATCH 091/133] remove unnecessary comments --- eng/generate-appgw-cert.ps1 | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/eng/generate-appgw-cert.ps1 b/eng/generate-appgw-cert.ps1 index 13a5b414c..c0282c413 100644 --- a/eng/generate-appgw-cert.ps1 +++ b/eng/generate-appgw-cert.ps1 @@ -1,22 +1,4 @@ #!/usr/bin/env pwsh -<# -.SYNOPSIS - Generate or retrieve SSL certificate from Azure Key Vault for Application Gateway -.DESCRIPTION - Creates a self-signed certificate in Azure Key Vault for the cloudapp.azure.com custom domain. - If the certificate already exists, it retrieves the secret URI. - Application Gateway references the certificate directly from Key Vault via managed identity. 
-.PARAMETER DnsName - The DNS name for the certificate (e.g., dnceng-managed-grafana-staging.westus2.cloudapp.azure.com) -.PARAMETER KeyVaultName - The name of the Azure Key Vault to store the certificate -.PARAMETER CertificateName - The name of the certificate in Key Vault (default: appgw-ssl-cert) -.PARAMETER ResourceGroupName - The resource group name for the Key Vault -.EXAMPLE - .\generate-appgw-cert.ps1 -DnsName "dnceng-managed-grafana-staging.westus2.cloudapp.azure.com" -KeyVaultName "dnceng-kv" -ResourceGroupName "monitoring-managed" -#> param( [Parameter(Mandatory = $true)] @@ -177,23 +159,6 @@ Write-Host "Issuer: $issuer" -ForegroundColor White Write-Host "Expires: $expiryDate" -ForegroundColor White Write-Host "" -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "Next Steps" -ForegroundColor Cyan -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "" -Write-Host "1. Grant Application Gateway access to Key Vault" -ForegroundColor Yellow -Write-Host " - Enable managed identity on Application Gateway" -ForegroundColor White -Write-Host " - Grant 'Get' permission on secrets to the identity" -ForegroundColor White -Write-Host "" -Write-Host "2. Use the unversioned secret ID in Bicep template" -ForegroundColor Yellow -Write-Host " - This allows automatic certificate rotation" -ForegroundColor White -Write-Host "" -Write-Host " Self-signed certificate notes:" -ForegroundColor Yellow -Write-Host " - Browser will show security warning" -ForegroundColor White -Write-Host " - Valid for 12 months" -ForegroundColor White -Write-Host " - For production, replace with CA-signed certificate" -ForegroundColor White -Write-Host "" - # Output for pipeline use Write-Host "Setting pipeline variables..." 
-ForegroundColor Yellow Write-Host "##vso[task.setvariable variable=KeyVaultSecretId]$unversionedSecretId" From b73f5f61ef7851c0af6b9057d4dcbad247399c73 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 17:49:16 -0800 Subject: [PATCH 092/133] remove unnecessary comments --- eng/provision-appgw.yaml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml index 4a214b906..419404db1 100644 --- a/eng/provision-appgw.yaml +++ b/eng/provision-appgw.yaml @@ -276,16 +276,6 @@ steps: Write-Host "šŸŽ‰ Your Grafana is immediately accessible at:" Write-Host " $accessUrl" Write-Host "" - Write-Host "šŸ”’ HTTPS-only configuration" - Write-Host " Certificate managed in Azure Key Vault" - Write-Host " Self-signed certificate (browser will show security warning)" - Write-Host "" - Write-Host "The cloudapp.azure.com domain is automatically configured!" - Write-Host "No DNS records needed - the domain works right away." - Write-Host "" - Write-Host "Original Grafana Endpoint: $(GrafanaEndpointFull)" - Write-Host "Backend FQDN: $(GrafanaEndpoint)" - Write-Host "" Write-Host "================================================" Write-Host "NEXT STEPS" Write-Host "================================================" @@ -298,14 +288,6 @@ steps: Write-Host " Certificate managed in Key Vault: $(KeyVaultName)" Write-Host " Thumbprint: $(CertificateThumbprint)" Write-Host "" - Write-Host " HTTP (port 80) is DISABLED - only HTTPS is supported" - Write-Host "" - Write-Host " For production, replace the self-signed certificate:" - Write-Host " 1. Import CA-signed certificate to Key Vault:" - Write-Host " az keyvault certificate import --vault-name $(KeyVaultName) \" - Write-Host " --name appgw-ssl-cert --file certificate.pfx" - Write-Host " 2. Application Gateway auto-updates (no redeployment needed)" - Write-Host "" Write-Host "The domain $customDomain is ready to use!" 
Write-Host "" Write-Host "================================================" @@ -377,7 +359,6 @@ steps: Write-Host "āœ“ Application Gateway deployment verified successfully" Write-Host "" Write-Host "šŸŽ‰ SUCCESS! Grafana is now accessible at: https://$customDomain" - Write-Host " (HTTPS only - HTTP disabled)" Write-Host "" Write-Host "Note: Backend health probes may take an additional 2-5 minutes to show as healthy" Write-Host "You can check status with:" From e24c9dfa90113da374c9c7a01e80206c13ae2ec1 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 18:01:02 -0800 Subject: [PATCH 093/133] remove unnecessary comments --- eng/provision-grafana.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 2862a0bc9..2aa67a0d9 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -338,7 +338,6 @@ jobs: Write-Host " Location: $($workspace.location)" Write-Host " SKU: $($workspace.sku.name)" Write-Host " Status: $($workspace.properties.provisioningState)" - Write-Host " Identity Type: $($workspace.identity.type)" # Display user-assigned identity details if ($workspace.identity.type -eq "UserAssigned") { @@ -350,8 +349,6 @@ jobs: $identityName = $identityId.Split('/')[-1] Write-Host " Name: $identityName" Write-Host " Resource ID: $identityId" - Write-Host " Principal ID: $($_.Value.principalId)" - Write-Host " Client ID: $($_.Value.clientId)" } } } else { From a0fd96779ff8506cf35ce7b0cda2dfe8047b543b Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 18:49:31 -0800 Subject: [PATCH 094/133] change token lifespan to 30 days --- eng/setup-grafana-api-token.ps1 | 37 ++++----------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 index 89565389f..419b7b817 100644 --- a/eng/setup-grafana-api-token.ps1 +++ b/eng/setup-grafana-api-token.ps1 @@ -1,21 +1,4 @@ 
#!/usr/bin/env pwsh -<# -.SYNOPSIS - Sets up Grafana API token in Key Vault for dashboard publishing -.DESCRIPTION - This script helps you create and store a Grafana API token in Azure Key Vault - for use by the dashboard publishing pipeline. -.PARAMETER Environment - The deployment environment (Staging or Production) -.PARAMETER ApiToken - The Grafana API token (if you already have one) -.PARAMETER KeyVaultName - The name of the Key Vault to store the token in -.EXAMPLE - .\setup-grafana-api-token.ps1 -Environment Staging -KeyVaultName "dnceng-amg-int-kv" -.EXAMPLE - .\setup-grafana-api-token.ps1 -Environment Production -KeyVaultName "dnceng-amg-prod-kv" -ApiToken "glsa_xxx" -#> param( [Parameter(Mandatory=$true)] @@ -110,9 +93,6 @@ if (-not $ApiToken) { Write-Host "Automated Service Account Creation" Write-Host "==========================================" Write-Host "" - Write-Host "This will automatically create a Grafana service account and token." - Write-Host "Using Azure CLI to authenticate to Grafana..." - Write-Host "" # Check if AMG extension is installed Write-Host "Checking Azure CLI Grafana extension..." @@ -195,8 +175,8 @@ if (-not $ApiToken) { Write-Host "" - # Create service account token (expires in 1 day = 86400 seconds) - Write-Host "Creating service account token (expires in 1 day)..." + # Create service account token (expires in 30 days = 2592000 seconds) + Write-Host "Creating service account token (expires in 30 days)..." 
$tokenName = "ci-cd-token-$(Get-Date -Format 'yyyyMMdd-HHmmss')" @@ -205,7 +185,7 @@ if (-not $ApiToken) { --resource-group $resourceGroup ` --service-account $serviceAccountId ` --token $tokenName ` - --time-to-live "1d" ` + --time-to-live "30d" ` -o json if ($LASTEXITCODE -ne 0) { @@ -220,7 +200,7 @@ if (-not $ApiToken) { Write-Host "āœ“ Service account token created" Write-Host " Token name: $tokenName" Write-Host " Token ID: $($tokenResponse.id)" - Write-Host " Expires in: 1 day (86400 seconds)" + Write-Host " Expires in: 30 days (2592000 seconds)" Write-Host "" } @@ -273,12 +253,3 @@ Write-Host "" Write-Host "The pipeline can now publish dashboards to:" Write-Host " $grafanaEndpoint" Write-Host "" -Write-Host "To test dashboard publishing locally, run:" -Write-Host " dotnet build src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj \" -Write-Host " -t:PublishGrafana \" -Write-Host " -p:GrafanaHost=$grafanaEndpoint \" -Write-Host " -p:GrafanaAccessToken= \" -Write-Host " -p:GrafanaKeyVaultName=$keyVaultName \" -Write-Host " -p:GrafanaEnvironment=$Environment \" -Write-Host " -p:ParametersFile=parameters.json" -Write-Host "" From 3dd87e9874a95088f8f0720780fbef69cdbc81c6 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 20:29:28 -0800 Subject: [PATCH 095/133] remove unnecessary comments --- src/Monitoring/Sdk/DeployPublisher.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index a144da730..b99fd47be 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -63,10 +63,8 @@ public async Task PostToGrafanaAsync() { await PostDatasourcesAsync().ConfigureAwait(false); - // Post contact points for unified alerting (Azure Managed Grafana) await PostContactPointsAsync().ConfigureAwait(false); - // Post alert rules for unified alerting (Azure Managed Grafana) await 
PostAlertRulesAsync().ConfigureAwait(false); await PostDashboardsAsync().ConfigureAwait(false); @@ -151,7 +149,7 @@ private async Task PostContactPointsAsync() private async Task PostAlertRulesAsync() { - // Check if alert rules directory exists (optional feature) + // Check if alert rules directory exists if (!Directory.Exists(AlertRuleDirectory)) { Log.LogMessage(MessageImportance.Low, "No alert rules directory found at {0}, skipping alert rules", AlertRuleDirectory); From 2b656351d0c3bba64220c5508bd693f4f652ec9c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 19 Nov 2025 23:34:17 -0800 Subject: [PATCH 096/133] remove unused contact point --- .../Production/statusHook.imageless.notification.json | 11 ----------- .../Staging/statusHook.imageless.notification.json | 11 ----------- 2 files changed, 22 deletions(-) delete mode 100644 src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json delete mode 100644 src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json deleted file mode 100644 index 5fefae928..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "name": ".NET Status Alert (no image)", - "type": "webhook", - "disableResolveMessage": false, - "settings": { - "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "httpMethod": "POST", - "username": "ignored", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" - } -} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json 
b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json deleted file mode 100644 index 3ebd2831b..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "name": ".NET Status Alert (no image)", - "type": "webhook", - "disableResolveMessage": false, - "settings": { - "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "httpMethod": "POST", - "username": "ignored", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" - } -} \ No newline at end of file From d5582a7e6b50ac0cc69ee3e32900f7fff17c59b7 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 20 Nov 2025 09:59:18 -0800 Subject: [PATCH 097/133] fix duplication of contact points --- src/Monitoring/Sdk/GrafanaClient.cs | 52 +++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index eea821e7e..d382edbee 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -189,25 +189,49 @@ public async Task CreateContactPointAsync(JObject contactPoint) { string name = contactPoint.Value("name"); - // Check if contact point already exists by name - JObject existing = await GetContactPointAsync(name).ConfigureAwait(false); + // List all contact points to find if one with this name already exists + var listUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); - if (existing != null) + using (HttpResponseMessage listResponse = await _client.GetAsync(listUri).ConfigureAwait(false)) { - // Update existing contact point using PUT - var uri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(name)}"); + await listResponse.EnsureSuccessWithContentAsync(); - // Preserve the existing uid - contactPoint["uid"] = existing.Value("uid"); + JArray 
allContactPoints; + using (Stream stream = await listResponse.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + allContactPoints = await JArray.LoadAsync(jsonReader).ConfigureAwait(false); + } - await SendObjectAsync(contactPoint, uri, HttpMethod.Put).ConfigureAwait(false); - } - else - { - // Create new contact point using POST - var uri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); - await SendObjectAsync(contactPoint, uri, HttpMethod.Post).ConfigureAwait(false); + // Find existing contact point by name + JObject existing = null; + foreach (JToken item in allContactPoints) + { + if (item is JObject cp && cp.Value("name") == name) + { + existing = cp; + break; + } + } + + if (existing != null) + { + // Update existing contact point using UID + string existingUid = existing.Value("uid"); + var updateUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(existingUid)}"); + + // Preserve the existing uid + contactPoint["uid"] = existingUid; + + await SendObjectAsync(contactPoint, updateUri, HttpMethod.Put).ConfigureAwait(false); + return; + } } + + // Create new contact point using POST only if not found + var createUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); + await SendObjectAsync(contactPoint, createUri, HttpMethod.Post).ConfigureAwait(false); } private async Task CreateOrUpdateAsync( From 070fbeb0c36e872229ef242c36ec1b3b47bb3810 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 20 Nov 2025 13:51:52 -0800 Subject: [PATCH 098/133] include dashboard for homepage --- .../dashboard/general/home.dashboard.json | 621 ++++++++++++++++++ .../Monitoring.ArcadeServices/parameters.json | 7 + src/Monitoring/Sdk/DeployPublisher.cs | 38 ++ src/Monitoring/Sdk/GrafanaClient.cs | 43 +- 4 files changed, 696 insertions(+), 13 deletions(-) create mode 100644 
src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json new file mode 100644 index 000000000..e31bd24a1 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -0,0 +1,621 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 8, + "w": 14, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "content": "\n# .NET Engineering Systems Monitoring\n\nFor questions or permission issues, email [dnceng@microsoft.com](mailto:dnceng@microsoft.com)\n\nThis monitoring site is used to monitor all services managed by the .NET Engineering team.\nFor information about what sorts of things are monitored, and how to go about adding new monitoring or alerting, see the [Guidance](https://github.com/dotnet/core-eng/blob/master/Documentation/Alerting.md).\n\nTo see information about privacy and cookies visit: [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkId=521839).\n\n", + "mode": "markdown" + }, + "pluginVersion": "8.3.6", + "title": "Home", + "type": "text" + }, + { + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 6, + "options": { + "folderId": 46, + "maxItems": 10, + "query": "", + "showHeadings": false, + "showRecentlyViewed": false, + "showSearch": true, + "showStarred": false, + "tags": [] + }, + "pluginVersion": "8.3.6", + "title": "arcade-services", + "type": "dashlist" + }, + { + "gridPos": { 
+ "h": 7, + "w": 7, + "x": 17, + "y": 6 + }, + "id": 7, + "options": { + "folderId": 17, + "maxItems": 10, + "query": "", + "showHeadings": false, + "showRecentlyViewed": false, + "showSearch": true, + "showStarred": false, + "tags": [] + }, + "pluginVersion": "8.3.6", + "title": "helix-services", + "type": "dashlist" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "5m", + "handler": 1, + "message": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only.", + "name": "Work Items Waiting Time Is Too High (Build Pools) alert", + "noDataState": "ok", + "notifications": [] + }, + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 7, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": 
{ + "mode": "single" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "pluginVersion": "3.5.0", + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "hide": false, + "pluginVersion": "3.5.0", + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "time_series" + } + ], + "thresholds": [ + 
{ + "colorMode": "critical", + "op": "gt", + "value": 30, + "visible": true + } + ], + "title": "Work Items Waiting Time (Build Pools)", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "5m", + "handler": 1, + "message": "95 percentile of work item waiting times is over 35 minutes. Test queues only.", + "name": "Work Items Waiting Time Is Too High (Test Queues)", + "noDataState": "ok", + "notifications": [] + }, + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 7, + "x": 7, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": 
"OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "pluginVersion": "3.5.0", + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "hide": false, + "pluginVersion": "3.5.0", + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "time_series" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 35, + "visible": true + } + ], + "title": "Work Items Waiting Time 
(Test Queues)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 13, + "w": 7, + "x": 17, + "y": 13 + }, + "id": 9, + "options": { + "alertName": "", + "dashboardAlerts": false, + "dashboardTitle": "", + "maxItems": 10, + "showOptions": "current", + "sortOrder": 1, + "stateFilter": { + "alerting": true, + "execution_error": false, + "no_data": false, + "ok": false, + "paused": false, + "pending": false + }, + "tags": [] + }, + "pluginVersion": "8.3.6", + "title": "Currently Active Alerts", + "type": "alertlist" + }, + { + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "description": "This panel shows all the times the Helix client crashed while processing a work item. The goal for this is to be greater than 99.99% reliability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1.0002, + "axisSoftMin": 0.9998, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.9999 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 17 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + 
"uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "pluginVersion": "3.5.0", + "query": "WorkItems\n| where Finished > ago(120d)\n| extend my=startofmonth(Finished)\n| summarize Pass=countif(Status != \"None\"), Fail=countif(Status == \"None\") by my\n| project my, PassPercent = toreal(Pass) / toreal(Pass + Fail)\n| order by my desc \n| limit 4\n| order by my asc", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Number of Crashes in Helix", + "type": "timeseries" + } + ], + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-4M", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ], + "type": "timepicker" + }, + "timezone": "browser", + "title": "Home", + "uid": "home", + "weekStart": "" +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/parameters.json b/src/Monitoring/Monitoring.ArcadeServices/parameters.json index 01cc7c57f..78079fcc0 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/parameters.json +++ b/src/Monitoring/Monitoring.ArcadeServices/parameters.json @@ -173,5 +173,12 @@ "Staging": "e6b5f9f5-0ca4-4351-879b-014d78400ec2", "Production": "fbd6122a-9ad3-42e4-976e-bccb82486856" } + }, + { + "Name": "home-dashboard-uid", + "Values": { + "Staging": "home", + "Production": "home" + } } ] \ No newline at end of file diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index b99fd47be..891419108 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ 
b/src/Monitoring/Sdk/DeployPublisher.cs @@ -68,6 +68,8 @@ public async Task PostToGrafanaAsync() await PostAlertRulesAsync().ConfigureAwait(false); await PostDashboardsAsync().ConfigureAwait(false); + + await SetHomeDashboardAsync().ConfigureAwait(false); } private async Task PostDatasourcesAsync() @@ -367,4 +369,40 @@ private SecretClient GetKeyVaultClient() Uri vaultUri = new($"https://{_keyVaultName}.vault.azure.net/"); return new SecretClient(vaultUri, _tokenCredential); } + + private async Task SetHomeDashboardAsync() + { + // Load parameters to get home dashboard UID + List parameters; + using (StreamReader sr = new StreamReader(_parameterFile)) + using (JsonReader jr = new JsonTextReader(sr)) + { + JsonSerializer jsonSerializer = new JsonSerializer(); + parameters = jsonSerializer.Deserialize>(jr); + } + + if (parameters == null) + { + Log.LogMessage(MessageImportance.Normal, "No parameters file found, skipping home dashboard configuration"); + return; + } + + // Find the home-dashboard-uid parameter + var homeDashboardParam = parameters.FirstOrDefault(p => p.Name == "home-dashboard-uid"); + if (homeDashboardParam == null || !homeDashboardParam.Values.TryGetValue(_environment, out string dashboardUid)) + { + Log.LogMessage(MessageImportance.Normal, "No home-dashboard-uid parameter found for environment {0}, skipping home dashboard configuration", _environment); + return; + } + + if (string.IsNullOrWhiteSpace(dashboardUid)) + { + Log.LogMessage(MessageImportance.Normal, "Home dashboard UID is empty, skipping home dashboard configuration"); + return; + } + + Log.LogMessage(MessageImportance.Normal, "Setting home dashboard to: {0}", dashboardUid); + await GrafanaClient.SetHomeDashboardAsync(dashboardUid).ConfigureAwait(false); + Log.LogMessage(MessageImportance.Normal, "Successfully set home dashboard"); + } } diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index d382edbee..ac04a9375 100644 --- 
a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -130,26 +130,32 @@ public async Task GetDataSourceByUidAsync(string uid) } } - public Task CreateFolderAsync(string uid, string title) + public async Task CreateFolderAsync(string uid, string title) { + // First try to get the folder - if it exists, just return it + // This handles the built-in "general" folder which can't be updated + var getUri = new Uri(new Uri(_baseUrl), $"/api/folders/{Uri.EscapeDataString(uid)}"); + using (HttpResponseMessage getResponse = await _client.GetAsync(getUri).ConfigureAwait(false)) + { + if (getResponse.IsSuccessStatusCode) + { + using (Stream stream = await getResponse.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + return await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + } + } + } + + // Folder doesn't exist, create it var folder = new JObject { {"uid", uid}, {"title", title}, }; - return CreateOrUpdateAsync( - folder, - folder.Value("uid"), - u => $"/api/folders/{Uri.EscapeDataString(u)}", - "/api/folders", - _ => (HttpMethod.Put, $"/api/folders/{uid}"), - (d, x) => - { - d.Remove("uid"); - d["version"] = x.Value("version"); - } - ); + return await SendObjectAsync(folder, new Uri(new Uri(_baseUrl), "/api/folders")).ConfigureAwait(false); } public Task CreateDatasourceAsync(JObject datasource) @@ -428,6 +434,17 @@ public async Task CreateAlertRuleAsync(JObject alertRule) } } + public async Task SetHomeDashboardAsync(string dashboardUid) + { + var preferences = new JObject + { + {"homeDashboardUID", dashboardUid} + }; + + var uri = new Uri(new Uri(_baseUrl), "/api/org/preferences"); + await SendObjectAsync(preferences, uri, HttpMethod.Put).ConfigureAwait(false); + } + public void Dispose() { _client?.Dispose(); From 3f104200de39f102594bdf6d9726621eeb7a4db8 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 21 Nov 
2025 09:46:14 -0800 Subject: [PATCH 099/133] remove plugin version --- .../dashboard/general/home.dashboard.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json index e31bd24a1..811caae0a 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -36,7 +36,6 @@ "content": "\n# .NET Engineering Systems Monitoring\n\nFor questions or permission issues, email [dnceng@microsoft.com](mailto:dnceng@microsoft.com)\n\nThis monitoring site is used to monitor all services managed by the .NET Engineering team.\nFor information about what sorts of things are monitored, and how to go about adding new monitoring or alerting, see the [Guidance](https://github.com/dotnet/core-eng/blob/master/Documentation/Alerting.md).\n\nTo see information about privacy and cookies visit: [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkId=521839).\n\n", "mode": "markdown" }, - "pluginVersion": "8.3.6", "title": "Home", "type": "text" }, @@ -58,7 +57,6 @@ "showStarred": false, "tags": [] }, - "pluginVersion": "8.3.6", "title": "arcade-services", "type": "dashlist" }, @@ -80,7 +78,6 @@ "showStarred": false, "tags": [] }, - "pluginVersion": "8.3.6", "title": "helix-services", "type": "dashlist" }, @@ -214,7 +211,6 @@ "type": "and" } }, - "pluginVersion": "3.5.0", "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = 
percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", "querySource": "raw", "rawMode": true, @@ -242,7 +238,6 @@ } }, "hide": false, - "pluginVersion": "3.5.0", "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", "querySource": "raw", "rawMode": true, @@ -391,7 +386,6 @@ "type": "and" } }, - "pluginVersion": "3.5.0", "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", "querySource": "raw", "rawMode": true, @@ -419,7 +413,6 @@ } }, "hide": false, - "pluginVersion": "3.5.0", "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", "querySource": "raw", "rawMode": true, @@ 
-463,7 +456,6 @@ }, "tags": [] }, - "pluginVersion": "8.3.6", "title": "Currently Active Alerts", "type": "alertlist" }, @@ -565,7 +557,6 @@ "type": "and" } }, - "pluginVersion": "3.5.0", "query": "WorkItems\n| where Finished > ago(120d)\n| extend my=startofmonth(Finished)\n| summarize Pass=countif(Status != \"None\"), Fail=countif(Status == \"None\") by my\n| project my, PassPercent = toreal(Pass) / toreal(Pass + Fail)\n| order by my desc \n| limit 4\n| order by my asc", "querySource": "raw", "rawMode": true, From 05178967acaa8cebcbef8eba2ecb42441f0c7402 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 21 Nov 2025 09:56:48 -0800 Subject: [PATCH 100/133] remove app gateway logic --- eng/deploy-managed-grafana.yml | 22 -- eng/deployment/azure-appgw-grafana.bicep | 283 ----------------- eng/provision-appgw.yaml | 367 ----------------------- 3 files changed, 672 deletions(-) delete mode 100644 eng/deployment/azure-appgw-grafana.bicep delete mode 100644 eng/provision-appgw.yaml diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index a96681fec..9d5099a0f 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -28,28 +28,6 @@ stages: GrafanaLocation: 'westus2' GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} -- stage: ProvisionApplicationGateway - displayName: 'Provision Application Gateway Custom Domain' - dependsOn: ProvisionGrafana - variables: - GrafanaIdentityId: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportIdentity.GrafanaIdentityId'] ] - jobs: - - job: DeployApplicationGateway - displayName: 'Deploy Azure Application Gateway' - pool: - name: NetCore1ESPool-Internal - demands: ImageOverride -equals 1es-windows-2022 - steps: - - template: /eng/provision-appgw.yaml@self - parameters: - ServiceConnection: ${{ parameters.ServiceConnectionName }} - ResourceGroupName: 'monitoring-managed' - Environment: ${{ parameters.DeploymentEnvironment }} - GrafanaWorkspaceName: ${{ 
parameters.GrafanaWorkspaceName }} - Location: 'westus2' - GrafanaIdentityId: $(GrafanaIdentityId) - GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - - stage: PublishDashboards displayName: 'Publish Grafana Dashboards' dependsOn: ProvisionGrafana diff --git a/eng/deployment/azure-appgw-grafana.bicep b/eng/deployment/azure-appgw-grafana.bicep deleted file mode 100644 index 32004caf2..000000000 --- a/eng/deployment/azure-appgw-grafana.bicep +++ /dev/null @@ -1,283 +0,0 @@ -// Azure Application Gateway with cloudapp.azure.com domains for Azure Managed Grafana -@description('The Azure region where the Application Gateway will be deployed') -param location string - -@description('The deployment environment (Staging or Production)') -param environment string - -@description('The Grafana workspace endpoint URL (without https://)') -param grafanaEndpoint string - -@description('The SKU name for Application Gateway') -@allowed([ - 'Standard_v2' - 'WAF_v2' -]) -param skuName string = 'WAF_v2' - -@description('The SKU tier for Application Gateway') -@allowed([ - 'Standard_v2' - 'WAF_v2' -]) -param skuTier string = 'WAF_v2' - -@description('The capacity (instance count) for Application Gateway') -@minValue(1) -@maxValue(10) -param capacity int = 2 - -@description('Tags to apply to resources') -param resourceTags object = { - Environment: environment - Purpose: 'Azure Managed Grafana Custom Domain' - Service: 'DncEng' -} - -@description('Key Vault secret ID for the SSL certificate. This is a URI/URL, not sensitive data.') -param certSecretId string = '' - -@description('Resource ID of the user-assigned managed identity created during Grafana provisioning') -param grafanaUserAssignedIdentityId string - -// Generate custom domain name based on environment and region -// Format: dnceng-managed-grafana[-staging].{region}.cloudapp.azure.com -var regionShortName = location == 'westus2' ? 'westus2' : location -var publicDnsLabel = environment == 'Production' ? 
'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' -var customDomainName = '${publicDnsLabel}.${regionShortName}.cloudapp.azure.com' - -// Resource names -var appGwName = environment == 'Production' ? 'dnceng-grafana-appgw' : 'dnceng-grafana-staging-appgw' -var publicIpName = environment == 'Production' ? 'dnceng-grafana-pip' : 'dnceng-grafana-staging-pip' -var vnetName = environment == 'Production' ? 'dnceng-grafana-vnet' : 'dnceng-grafana-staging-vnet' -var wafPolicyName = environment == 'Production' ? 'dnceng-grafana-waf-policy' : 'dnceng-grafana-staging-waf-policy' -var subnetName = 'appgw-subnet' -var backendPoolName = 'grafana-backend-pool' -var frontendPortName = 'https-port' -var frontendIpConfigName = 'appgw-frontend-ip' -var httpSettingName = 'grafana-http-setting' -var listenerName = 'https-listener' -var ruleName = 'https-routing-rule' -var probeName = 'grafana-health-probe' -var sslCertificateName = 'appgw-ssl-cert' - -// WAF Policy -resource wafPolicy 'Microsoft.Network/ApplicationGatewayWebApplicationFirewallPolicies@2023-05-01' = { - name: wafPolicyName - location: location - tags: resourceTags - properties: { - policySettings: { - requestBodyCheck: true - maxRequestBodySizeInKb: 128 - fileUploadLimitInMb: 100 - state: 'Enabled' - mode: 'Prevention' - } - managedRules: { - managedRuleSets: [ - { - ruleSetType: 'OWASP' - ruleSetVersion: '3.2' - } - { - ruleSetType: 'Microsoft_BotManagerRuleSet' - ruleSetVersion: '1.0' - } - ] - } - } -} - -// Virtual Network for Application Gateway -resource vnet 'Microsoft.Network/virtualNetworks@2023-05-01' = { - name: vnetName - location: location - tags: resourceTags - properties: { - addressSpace: { - addressPrefixes: [ - '10.0.0.0/16' - ] - } - subnets: [ - { - name: subnetName - properties: { - addressPrefix: '10.0.0.0/24' - privateEndpointNetworkPolicies: 'Disabled' - privateLinkServiceNetworkPolicies: 'Disabled' - } - } - ] - } -} - -// Public IP for Application Gateway with custom DNS label 
-resource publicIp 'Microsoft.Network/publicIPAddresses@2023-05-01' = { - name: publicIpName - location: location - tags: resourceTags - sku: { - name: 'Standard' - tier: 'Regional' - } - properties: { - publicIPAllocationMethod: 'Static' - publicIPAddressVersion: 'IPv4' - dnsSettings: { - domainNameLabel: publicDnsLabel - } - idleTimeoutInMinutes: 4 - } -} - -// Application Gateway -resource applicationGateway 'Microsoft.Network/applicationGateways@2023-05-01' = { - name: appGwName - location: location - tags: resourceTags - identity: { - type: 'UserAssigned' - userAssignedIdentities: { - '${grafanaUserAssignedIdentityId}': {} - } - } - properties: { - sku: { - name: skuName - tier: skuTier - capacity: capacity - } - firewallPolicy: { - id: wafPolicy.id - } - gatewayIPConfigurations: [ - { - name: 'appgw-ip-config' - properties: { - subnet: { - id: vnet.properties.subnets[0].id - } - } - } - ] - frontendIPConfigurations: [ - { - name: frontendIpConfigName - properties: { - publicIPAddress: { - id: publicIp.id - } - } - } - ] - frontendPorts: [ - { - name: frontendPortName - properties: { - port: 443 - } - } - ] - backendAddressPools: [ - { - name: backendPoolName - properties: { - backendAddresses: [ - { - fqdn: grafanaEndpoint - } - ] - } - } - ] - sslCertificates: certSecretId != '' ? 
[ - { - name: sslCertificateName - properties: { - keyVaultSecretId: certSecretId - } - } - ] : [] - backendHttpSettingsCollection: [ - { - name: httpSettingName - properties: { - port: 443 - protocol: 'Https' - cookieBasedAffinity: 'Enabled' - pickHostNameFromBackendAddress: true - requestTimeout: 30 - probe: { - id: resourceId('Microsoft.Network/applicationGateways/probes', appGwName, probeName) - } - } - } - ] - httpListeners: [ - { - name: listenerName - properties: { - frontendIPConfiguration: { - id: resourceId('Microsoft.Network/applicationGateways/frontendIPConfigurations', appGwName, frontendIpConfigName) - } - frontendPort: { - id: resourceId('Microsoft.Network/applicationGateways/frontendPorts', appGwName, frontendPortName) - } - protocol: 'Https' - requireServerNameIndication: false - sslCertificate: certSecretId != '' ? { - id: resourceId('Microsoft.Network/applicationGateways/sslCertificates', appGwName, sslCertificateName) - } : null - } - } - ] - requestRoutingRules: [ - { - name: ruleName - properties: { - ruleType: 'Basic' - priority: 100 - httpListener: { - id: resourceId('Microsoft.Network/applicationGateways/httpListeners', appGwName, listenerName) - } - backendAddressPool: { - id: resourceId('Microsoft.Network/applicationGateways/backendAddressPools', appGwName, backendPoolName) - } - backendHttpSettings: { - id: resourceId('Microsoft.Network/applicationGateways/backendHttpSettingsCollection', appGwName, httpSettingName) - } - } - } - ] - probes: [ - { - name: probeName - properties: { - protocol: 'Https' - path: '/api/health' - interval: 30 - timeout: 30 - unhealthyThreshold: 3 - pickHostNameFromBackendHttpSettings: true - minServers: 0 - match: { - statusCodes: [ - '200-401' - ] - } - } - } - ] - enableHttp2: true - } -} - -// Outputs -output applicationGatewayName string = applicationGateway.name -output applicationGatewayIdentity string = applicationGateway.identity.userAssignedIdentities[grafanaUserAssignedIdentityId].principalId -output 
publicIpAddress string = publicIp.properties.ipAddress -output customDomainName string = customDomainName -output customDomainUrl string = 'https://${customDomainName}' -output accessUrl string = 'https://${customDomainName}' diff --git a/eng/provision-appgw.yaml b/eng/provision-appgw.yaml deleted file mode 100644 index 419404db1..000000000 --- a/eng/provision-appgw.yaml +++ /dev/null @@ -1,367 +0,0 @@ -parameters: - - name: ServiceConnection - type: string - - name: ResourceGroupName - type: string - - name: Environment - type: string - - name: GrafanaWorkspaceName - type: string - - name: Location - type: string - default: 'westus2' - - name: GrafanaIdentityId - type: string - - name: GrafanaKeyVault - type: string - -steps: - - task: AzureCLI@2 - displayName: 'Validate Application Gateway Bicep Template' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Validating Application Gateway Bicep template..." - az bicep build --file eng/deployment/azure-appgw-grafana.bicep - if ($LASTEXITCODE -ne 0) { - Write-Error "Bicep validation failed" - exit 1 - } - Write-Host "āœ“ Application Gateway Bicep template is valid" - - - task: AzureCLI@2 - displayName: 'Get Grafana Endpoint' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Retrieving Grafana workspace endpoint..." 
- $grafanaEndpointFull = az grafana show ` - --name "${{ parameters.GrafanaWorkspaceName }}" ` - --resource-group "${{ parameters.ResourceGroupName }}" ` - --query "properties.endpoint" ` - --output tsv - - if ([string]::IsNullOrEmpty($grafanaEndpointFull)) { - Write-Error "Failed to retrieve Grafana endpoint" - exit 1 - } - - # Remove https:// prefix and trailing slash for Application Gateway backend - $grafanaEndpoint = $grafanaEndpointFull -replace '^https://', '' -replace '/$', '' - - Write-Host "Grafana Full Endpoint: $grafanaEndpointFull" - Write-Host "Grafana Backend FQDN: $grafanaEndpoint" - Write-Host "##vso[task.setvariable variable=GrafanaEndpoint]$grafanaEndpoint" - Write-Host "##vso[task.setvariable variable=GrafanaEndpointFull]$grafanaEndpointFull" - - - task: AzureCLI@2 - displayName: 'Grant Pipeline Service Principal Key Vault Certificate Access' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Granting pipeline service principal Key Vault Certificates Officer role..." - - $kvName = "${{ parameters.GrafanaKeyVault }}" - $rgName = "${{ parameters.ResourceGroupName }}" - - # Get the current service principal object ID - $spObjectId = az account show --query "user.name" --output tsv - Write-Host "Service Principal Object ID: $spObjectId" - - # Get the Key Vault resource ID - $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv - Write-Host "Key Vault: $kvName" - Write-Host "Key Vault ID: $kvId" - - # Check if role assignment already exists - $existingAssignment = az role assignment list ` - --assignee $spObjectId ` - --scope $kvId ` - --role "Key Vault Certificates Officer" ` - --query "[0].id" ` - --output tsv - - if ($existingAssignment) { - Write-Host "āœ“ Pipeline service principal already has Key Vault Certificates Officer role" - } else { - Write-Host "Granting Key Vault Certificates Officer role..." 
- az role assignment create ` - --role "Key Vault Certificates Officer" ` - --assignee $spObjectId ` - --scope $kvId ` - --output none - - if ($LASTEXITCODE -eq 0) { - Write-Host "āœ“ Pipeline service principal granted Key Vault Certificates Officer role" - Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." - Start-Sleep -Seconds 30 - } else { - Write-Error "Failed to grant Key Vault Certificates Officer role" - exit 1 - } - } - - - task: AzureCLI@2 - displayName: 'Grant Pipeline Service Principal Key Vault Secrets Access' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Granting pipeline service principal Key Vault Secrets Officer role..." - - $kvName = "${{ parameters.GrafanaKeyVault }}" - $rgName = "${{ parameters.ResourceGroupName }}" - - # Get the current service principal object ID - $spObjectId = az account show --query "user.name" --output tsv - Write-Host "Service Principal Object ID: $spObjectId" - - # Get the Key Vault resource ID - $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv - Write-Host "Key Vault: $kvName" - Write-Host "Key Vault ID: $kvId" - - # Check if role assignment already exists - $existingAssignment = az role assignment list ` - --assignee $spObjectId ` - --scope $kvId ` - --role "Key Vault Secrets Officer" ` - --query "[0].id" ` - --output tsv - - if ($existingAssignment) { - Write-Host "āœ“ Pipeline service principal already has Key Vault Secrets Officer role" - } else { - Write-Host "Granting Key Vault Secrets Officer role..." - az role assignment create ` - --role "Key Vault Secrets Officer" ` - --assignee $spObjectId ` - --scope $kvId ` - --output none - - if ($LASTEXITCODE -eq 0) { - Write-Host "āœ“ Pipeline service principal granted Key Vault Secrets Officer role" - Write-Host "ā± Waiting 30 seconds for role assignment to propagate..." 
- Start-Sleep -Seconds 30 - } else { - Write-Error "Failed to grant Key Vault Secrets Officer role" - exit 1 - } - } - - - task: AzureCLI@2 - displayName: 'Setup Certificate in Key Vault' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'scriptPath' - scriptPath: 'eng/generate-appgw-cert.ps1' - ${{ if eq(parameters.Environment, 'Staging') }}: - arguments: >- - -DnsName "dnceng-managed-grafana-staging.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "${{ parameters.GrafanaKeyVault }}" - -ResourceGroupName "${{ parameters.ResourceGroupName }}" - -Location "${{ parameters.Location }}" - ${{ if eq(parameters.Environment, 'Production') }}: - arguments: >- - -DnsName "dnceng-managed-grafana.${{ parameters.Location }}.cloudapp.azure.com" - -KeyVaultName "${{ parameters.GrafanaKeyVault }}" - -ResourceGroupName "${{ parameters.ResourceGroupName }}" - -Location "${{ parameters.Location }}" - - - task: AzureResourceManagerTemplateDeployment@3 - displayName: 'Deploy Application Gateway for Grafana' - inputs: - deploymentScope: 'Resource Group' - azureResourceManagerConnection: ${{ parameters.ServiceConnection }} - action: 'Create Or Update Resource Group' - resourceGroupName: ${{ parameters.ResourceGroupName }} - location: ${{ parameters.Location }} - templateLocation: 'Linked artifact' - csmFile: 'eng/deployment/azure-appgw-grafana.bicep' - overrideParameters: >- - -environment "${{ parameters.Environment }}" - -location "${{ parameters.Location }}" - -grafanaEndpoint "$(GrafanaEndpoint)" - -certSecretId "$(KeyVaultSecretId)" - -grafanaUserAssignedIdentityId "${{ parameters.GrafanaIdentityId }}" - deploymentMode: 'Incremental' - deploymentOutputs: 'AppGatewayOutputs' - - - task: AzureCLI@2 - displayName: 'Grant Application Gateway Access to Key Vault' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host 
"Granting Application Gateway managed identity access to Key Vault..." - - $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json - $appGwIdentity = $outputs.applicationGatewayIdentity.value - $kvName = "$(KeyVaultName)" - - Write-Host "Application Gateway Identity: $appGwIdentity" - Write-Host "Key Vault: $kvName" - - # Get the Key Vault resource ID - $rgName = "${{ parameters.ResourceGroupName }}" - $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv - - Write-Host "Key Vault ID: $kvId" - - # Check if role assignment already exists - $existingAssignment = az role assignment list ` - --assignee $appGwIdentity ` - --scope $kvId ` - --role "Key Vault Secrets User" ` - --query "[0].id" ` - --output tsv - - if ($existingAssignment) { - Write-Host "āœ“ Application Gateway already has Key Vault Secrets User role" - } else { - Write-Host "Granting Key Vault Secrets User role (RBAC)..." - az role assignment create ` - --role "Key Vault Secrets User" ` - --assignee $appGwIdentity ` - --scope $kvId ` - --output none - - if ($LASTEXITCODE -eq 0) { - Write-Host "āœ“ Application Gateway can now access certificates from Key Vault" - } else { - Write-Error "Failed to grant Key Vault access" - exit 1 - } - } - - - - task: AzureCLI@2 - displayName: 'Display Custom Domain Information' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "================================================" - Write-Host "APPLICATION GATEWAY CUSTOM DOMAIN DEPLOYED" - Write-Host "================================================" - - $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json - - $customDomain = $outputs.customDomainName.value - $customDomainUrl = $outputs.customDomainUrl.value - $accessUrl = $outputs.accessUrl.value - $publicIp = $outputs.publicIpAddress.value - $appGwName = $outputs.applicationGatewayName.value - - Write-Host "" - Write-Host "Environment: 
${{ parameters.Environment }}" - Write-Host "Application Gateway: $appGwName" - Write-Host "Public IP Address: $publicIp" - Write-Host "Custom Domain: $customDomain" - Write-Host "" - Write-Host "================================================" - Write-Host "IMMEDIATE ACCESS (NO DNS SETUP REQUIRED)" - Write-Host "================================================" - Write-Host "" - Write-Host "šŸŽ‰ Your Grafana is immediately accessible at:" - Write-Host " $accessUrl" - Write-Host "" - Write-Host "================================================" - Write-Host "NEXT STEPS" - Write-Host "================================================" - Write-Host "1. Wait 2-5 minutes for Application Gateway to become healthy" - Write-Host "2. Access Grafana at: $accessUrl" - Write-Host "3. Accept the self-signed certificate warning in your browser" - Write-Host "4. HTTPS traffic is proxied to Grafana HTTPS backend" - Write-Host "5. Health probe monitors: $(GrafanaEndpointFull)/api/health" - Write-Host "" - Write-Host " Certificate managed in Key Vault: $(KeyVaultName)" - Write-Host " Thumbprint: $(CertificateThumbprint)" - Write-Host "" - Write-Host "The domain $customDomain is ready to use!" - Write-Host "" - Write-Host "================================================" - - - task: AzureCLI@2 - displayName: 'Verify Application Gateway Deployment' - inputs: - azureSubscription: ${{ parameters.ServiceConnection }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Verifying Application Gateway deployment..." - - $outputs = '$(AppGatewayOutputs)' | ConvertFrom-Json - $appGwName = $outputs.applicationGatewayName.value - $customDomain = $outputs.customDomainName.value - - $maxAttempts = 10 - $attempt = 0 - $verified = $false - - while ($attempt -lt $maxAttempts -and -not $verified) { - $attempt++ - Write-Host "Verification attempt $attempt of $maxAttempts..." 
- - try { - $provisioningState = az network application-gateway show ` - --name "$appGwName" ` - --resource-group "${{ parameters.ResourceGroupName }}" ` - --query "provisioningState" ` - --output tsv - - Write-Host "Provisioning State: $provisioningState" - - if ($provisioningState -eq "Succeeded") { - Write-Host "āœ“ Application Gateway deployed successfully" - - # Check operational state - $operationalState = az network application-gateway show ` - --name "$appGwName" ` - --resource-group "${{ parameters.ResourceGroupName }}" ` - --query "operationalState" ` - --output tsv - - Write-Host "Operational State: $operationalState" - - if ($operationalState -eq "Running") { - Write-Host "āœ“ Application Gateway is running" - $verified = $true - } else { - Write-Host "Waiting for Application Gateway to start..." - Start-Sleep -Seconds 15 - } - } else { - Write-Host "Application Gateway still provisioning..." - Start-Sleep -Seconds 15 - } - } catch { - Write-Warning "Verification attempt $attempt failed: $_" - Start-Sleep -Seconds 15 - } - } - - if (-not $verified) { - Write-Error "Failed to verify Application Gateway deployment after $maxAttempts attempts" - exit 1 - } - - Write-Host "āœ“ Application Gateway deployment verified successfully" - Write-Host "" - Write-Host "šŸŽ‰ SUCCESS! 
Grafana is now accessible at: https://$customDomain" - Write-Host "" - Write-Host "Note: Backend health probes may take an additional 2-5 minutes to show as healthy" - Write-Host "You can check status with:" - Write-Host "az network application-gateway show-backend-health \" - Write-Host " --name $appGwName \" - Write-Host " --resource-group ${{ parameters.ResourceGroupName }}" \ No newline at end of file From 98f709a45c367af22da499c6a6ba908a5dde1adf Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 21 Nov 2025 16:21:21 -0800 Subject: [PATCH 101/133] Migrate alert rules to Azure Managed Grafana unified alerting format - Convert 30 alert rules (15 Staging + 15 Production) to Grafana unified alerting - Update alert structure: conditions, data queries, and thresholds - Configure notification settings with contact points - Set folderUID to 'arcade-services' for all alerts - Add proper intervalSeconds and rule group configurations --- .../Production/cores-consumption.alert.json | 4 +- ...otneteng-status-failed-requests.alert.json | 16 +----- .../helix-api-availability.alert.json | 3 - ...helix-api-average-response-time.alert.json | 3 - ...elix-autoscaler-service-stopped.alert.json | 3 - .../pcs-background-worker-stopped.alert.json | 34 +----------- ...ontainer-job-execution-failures.alert.json | 3 +- .../pcs-disk-space-issues.alert.json | 12 ---- .../Production/pcs-exceptions-high.alert.json | 11 ---- .../pcs-git-push-success-rate.alert.json | 12 ---- .../pcs-work-item-success-rate.alert.json | 55 ++++--------------- .../Production/quota-eastus.alert.json | 31 +++++++++-- .../Production/quota-westus.alert.json | 2 +- .../Production/quota-westus2.alert.json | 2 +- .../source-dot-net-availability.alert.json | 3 - .../Staging/cores-consumption.alert.json | 4 +- ...otneteng-status-failed-requests.alert.json | 16 +----- .../Staging/helix-api-availability.alert.json | 3 - ...helix-api-average-response-time.alert.json | 3 - ...elix-autoscaler-service-stopped.alert.json | 
3 - .../pcs-background-worker-stopped.alert.json | 34 +----------- ...ontainer-job-execution-failures.alert.json | 3 +- .../Staging/pcs-disk-space-issues.alert.json | 12 ---- .../Staging/pcs-exceptions-high.alert.json | 11 ---- .../pcs-git-push-success-rate.alert.json | 12 ---- .../pcs-work-item-success-rate.alert.json | 26 --------- .../Staging/quota-eastus.alert.json | 4 +- .../Staging/quota-westus.alert.json | 2 +- .../Staging/quota-westus2.alert.json | 2 +- .../source-dot-net-availability.alert.json | 3 - 30 files changed, 57 insertions(+), 275 deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json index cfb495d83..3450d1b7b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -19,6 +19,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -105,9 +106,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json index f0d19fefe..e8b66ef92 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json @@ -18,19 +18,10 @@ "workspace": 
"/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "eastus", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "eastus", - "resourceGroup": "monitoring", - "resourceName": "DotNetEng-Status-Prod", - "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" - } - ], - "timeGrain": "auto" + "timeGrain": "auto", + "top": "10" }, "datasource": { "type": "grafana-azure-monitor-datasource", @@ -133,7 +124,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json index 991b04e21..85e307d34 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json @@ -153,9 +153,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json index dde9851b2..b3cfd67b6 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json 
@@ -142,9 +142,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json index 52876deb1..cb3c942e3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json @@ -150,9 +150,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json index 616f1afc6..082c7bfbc 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -42,11 +31,7 @@ }, "queryType": "Azure Log Analytics", "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - 
"fbd6122a-9ad3-42e4-976e-bccb82486856", - "e6b5f9f5-0ca4-4351-879b-014d78400ec2" - ] + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" } }, { @@ -69,16 +54,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -88,11 +63,7 @@ "hide": false, "queryType": "Azure Log Analytics", "refId": "B", - "subscription": "[dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "fbd6122a-9ad3-42e4-976e-bccb82486856", - "e6b5f9f5-0ca4-4351-879b-014d78400ec2" - ] + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" } }, { @@ -179,7 +150,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json index e3bf54bd1..1a18b0ff1 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json @@ -29,7 +29,7 @@ { "metricNamespace": "Microsoft.OperationalInsights/workspaces", "region": "westus2", - "resourceGroup": "product-construction-service", + "resourceGroup": "[parameter(product-construction-service-resourcegroup)]", "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", "subscription": 
"[parameter(dotnet-product-construction-services-subscriptionid)]" } @@ -133,7 +133,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert (no image, 12h reminder)", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "12h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json index 04020ce81..12adcd112 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -133,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json index 7e3902030..9a9cc78e7 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json @@ -23,16 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "resources": [ - { - "metricNamespace": 
"microsoft.insights/components", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -132,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json index 457990212..9fa19d01b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json @@ -19,17 +19,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -133,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json index 1538aafdb..6e11755a7 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -46,8 +35,7 @@ "subscriptions": [ "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", "cab65fc3-d077-467d-931f-3932eabf36d3" - ], - "intervalMs": 300000 + ] } }, { @@ -66,18 +54,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -91,8 +67,7 @@ "subscriptions": [ "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", "cab65fc3-d077-467d-931f-3932eabf36d3" - ], - "intervalMs": 300000 + ] }, "relativeTimeRange": { "from": 86400, @@ -103,6 +78,10 @@ "refId": "C", "queryType": "", "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, "model": { "conditions": [ { @@ -130,12 +109,7 @@ "expression": "B", "reducer": "mean", "refId": "C", - "type": "reduce", - "intervalMs": 300000 - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 + "type": "reduce" } }, { @@ -168,33 +142,26 @@ }, "expression": "C", "refId": "D", - "type": "threshold", - "intervalMs": 300000 - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 + "type": "threshold" } } ], "noDataState": "KeepLast", 
"execErrState": "KeepLast", "for": "5m", + "frequency": "1m", "annotations": { - "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\\n\\nThe PCS background work items started to fail frequently.\\n\\n@dotnet/prodconsvcs" + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "d71fe025a8954b6cad9866354ca041ee" }, "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", - "intervalSeconds": 60, + "intervalMs": 900000, "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json index 383921ae5..0c8dd6207 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json @@ -15,8 +15,10 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], - "timeGrain": "auto" + "timeGrain": "auto", + "top": "10" }, "datasource": { "type": "grafana-azure-monitor-datasource", @@ -31,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 300, + "from": 86400, "to": 0 } }, @@ -40,6 +42,25 @@ "queryType": "", "datasourceUid": "-100", "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] 
+ }, + "type": "query" + } + ], "datasource": { "type": "__expr__", "uid": "-100" @@ -69,7 +90,7 @@ "type": "and" }, "query": { - "params": ["C"] + "params": ["B"] }, "type": "query" } @@ -87,6 +108,7 @@ "noDataState": "KeepLast", "execErrState": "KeepLast", "for": "5m", + "frequency": "1m", "annotations": { "description": "An Azure Resource Quota is nearing its limit in region eastus!" }, @@ -95,11 +117,10 @@ }, "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", - "intervalSeconds": 60, + "intervalMs": 900000, "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json index 1b9aab107..ef697622f 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json @@ -15,6 +15,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -101,7 +102,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json index 08da0b4f5..09a010687 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json @@ -15,6 +15,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", 
"dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -101,7 +102,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json index 087546715..11ff1f0f1 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json @@ -147,9 +147,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json index cfb495d83..3450d1b7b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json @@ -19,6 +19,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -105,9 +106,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json index f0d19fefe..e8b66ef92 100644 --- 
a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json @@ -18,19 +18,10 @@ "workspace": "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "eastus", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "eastus", - "resourceGroup": "monitoring", - "resourceName": "DotNetEng-Status-Prod", - "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" - } - ], - "timeGrain": "auto" + "timeGrain": "auto", + "top": "10" }, "datasource": { "type": "grafana-azure-monitor-datasource", @@ -133,7 +124,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json index 991b04e21..85e307d34 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json @@ -153,9 +153,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json index dde9851b2..b3cfd67b6 
100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json @@ -142,9 +142,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json index 52876deb1..cb3c942e3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json @@ -150,9 +150,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json index 616f1afc6..082c7bfbc 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, 
"datasource": { @@ -42,11 +31,7 @@ }, "queryType": "Azure Log Analytics", "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "fbd6122a-9ad3-42e4-976e-bccb82486856", - "e6b5f9f5-0ca4-4351-879b-014d78400ec2" - ] + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" } }, { @@ -69,16 +54,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -88,11 +63,7 @@ "hide": false, "queryType": "Azure Log Analytics", "refId": "B", - "subscription": "[dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "fbd6122a-9ad3-42e4-976e-bccb82486856", - "e6b5f9f5-0ca4-4351-879b-014d78400ec2" - ] + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" } }, { @@ -179,7 +150,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json index e3bf54bd1..1a18b0ff1 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json @@ -29,7 +29,7 @@ { "metricNamespace": "Microsoft.OperationalInsights/workspaces", "region": "westus2", - "resourceGroup": "product-construction-service", + "resourceGroup": 
"[parameter(product-construction-service-resourcegroup)]", "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" } @@ -133,7 +133,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert (no image, 12h reminder)", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "12h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json index 04020ce81..12adcd112 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -133,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json index 7e3902030..9a9cc78e7 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json @@ -23,16 +23,6 @@ }, "azureMonitor": { "dimensionFilters": 
[], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -132,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json index 457990212..9fa19d01b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json @@ -19,17 +19,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -133,7 +122,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json index acba7015a..6e11755a7 100644 --- 
a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json @@ -23,17 +23,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -65,18 +54,6 @@ }, "azureMonitor": { "dimensionFilters": [], - "hide": false, - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resources": [ - { - "metricNamespace": "microsoft.insights/components", - "region": "westus2", - "resourceGroup": "product-construction-service", - "resourceName": "product-construction-service-ai-int", - "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" - } - ], "timeGrain": "auto" }, "datasource": { @@ -185,9 +162,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json index 752a9644f..0c8dd6207 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json @@ -15,6 +15,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -53,7 +54,7 @@ "query": { "params": [ "A", - "30m", + 
"5m", "now" ] }, @@ -120,7 +121,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json index 1b9aab107..ef697622f 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json @@ -15,6 +15,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -101,7 +102,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json index 08da0b4f5..09a010687 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json @@ -15,6 +15,7 @@ "workspace": "[parameter(default-workspace-resourcepath)]" }, "azureMonitor": { + "dimensionFilter": "*", "dimensionFilters": [], "timeGrain": "auto", "top": "10" @@ -101,7 +102,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": ["alertname"], "group_wait": "5m", "repeat_interval": "4h" } diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json index 087546715..11ff1f0f1 100644 --- 
a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json @@ -147,9 +147,6 @@ "isPaused": false, "notification_settings": { "receiver": ".NET Status Alert", - "group_by": [ - "alertname" - ], "group_wait": "5m", "repeat_interval": "4h" } From f09179762b49f1cb50e5d41374f4543bb7496ca1 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 01:03:10 -0800 Subject: [PATCH 102/133] grant grafa MI access to engineeringdata --- eng/provision-grafana.yaml | 131 ++++++++++++++++++ .../dashboard/general/home.dashboard.json | 2 +- 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 2aa67a0d9..5a776504b 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -283,6 +283,137 @@ jobs: Write-Host "ā„¹ļø RBAC propagation can take 2-5 minutes for Azure Monitor queries to work" Write-Host "" + - task: AzureCLI@2 + displayName: 'Grant Grafana Identity Kusto Database Access' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "==========================================" + Write-Host "Granting Kusto Database Access to Grafana Identity" + Write-Host "==========================================" + Write-Host "" + + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $environment = "${{ parameters.DeploymentEnvironment }}" + + # Get the user-assigned managed identity + $managedIdentityName = if ($environment -eq 'Production') { 'dnceng-managed-grafana' } else { 'dnceng-managed-grafana-staging' } + Write-Host "Retrieving managed identity: $managedIdentityName" + + $identity = az identity show --name $managedIdentityName --resource-group $rgName --query 
'{principalId:principalId, clientId:clientId}' --output json | ConvertFrom-Json + + if (-not $identity) { + Write-Error "Failed to retrieve managed identity: $managedIdentityName" + exit 1 + } + + $principalId = $identity.principalId + $clientId = $identity.clientId + + Write-Host "āœ“ Managed Identity: $managedIdentityName" + Write-Host "āœ“ Principal ID: $principalId" + Write-Host "āœ“ Client ID: $clientId" + Write-Host "" + + # Define Kusto clusters and databases based on environment + $kustoConfig = @() + + if ($environment -eq "Staging") { + $kustoConfig += @{ + ClusterName = "engdata" + ClusterUri = "https://engdata.westus2.kusto.windows.net" + Database = "engineeringdata" + ResourceGroup = "helixstagingkusto" + } + } else { + $kustoConfig += @{ + ClusterName = "engsrvprod" + ClusterUri = "https://engsrvprod.westus.kusto.windows.net" + Database = "engineeringdata" + ResourceGroup = "helixprodkusto" + } + } + + Write-Host "Granting database viewer permissions on Kusto clusters..." + Write-Host "" + + $successCount = 0 + $failCount = 0 + + foreach ($cluster in $kustoConfig) { + Write-Host "Processing: $($cluster.ClusterName) - $($cluster.Database)" + + # Construct the AAD principal string for Kusto + $aadPrincipal = "aadapp=$clientId" + + # Create the Kusto command to add database viewer + $kustoCommand = ".add database ['$($cluster.Database)'] viewers ('$aadPrincipal') 'Grafana Managed Identity - $managedIdentityName'" + + Write-Host " Executing Kusto command..." 
+ Write-Host " Command: $kustoCommand" + + # Execute the command using Azure CLI + try { + $result = az kusto script create ` + --cluster-name $cluster.ClusterName ` + --database-name $cluster.Database ` + --resource-group $cluster.ResourceGroup ` + --script-content $kustoCommand ` + --name "grant-grafana-access-$(Get-Date -Format 'yyyyMMddHHmmss')" ` + --force-update-tag "$(Get-Date -Format 'yyyyMMddHHmmss')" ` + --continue-on-errors false ` + --output none 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host " āœ“ Database viewer permission granted successfully" + $successCount++ + } else { + Write-Warning " ⚠ Failed to grant database viewer permission" + Write-Warning " Error: $result" + Write-Host " ā„¹ļø Attempting alternative method using principal-assignment..." + + # Alternative: Use principal-assignment API + try { + az kusto database-principal-assignment create ` + --cluster-name $cluster.ClusterName ` + --database-name $cluster.Database ` + --principal-assignment-name "grafana-$environment-viewer" ` + --principal-id $principalId ` + --principal-type App ` + --role Viewer ` + --resource-group $cluster.ResourceGroup ` + --tenant-id "72f988bf-86f1-41af-91ab-2d7cd011db47" ` + --output none 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host " āœ“ Database viewer permission granted via principal-assignment" + $successCount++ + } else { + Write-Error " āœ— Both methods failed to grant database access" + $failCount++ + } + } catch { + Write-Error " āœ— Both methods failed to grant database access" + Write-Error " Exception: $($_.Exception.Message)" + $failCount++ + } + } + } catch { + Write-Error " āœ— Failed to execute Kusto command" + Write-Error " Exception: $($_.Exception.Message)" + $failCount++ + } + + Write-Host "" + } + + Write-Host "" + Write-Host "ā„¹ļø Kusto permission changes are effective immediately" + Write-Host "" + - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: diff --git 
a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json index 811caae0a..18a68c6ac 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -69,7 +69,7 @@ }, "id": 7, "options": { - "folderId": 17, + "folderId": 92, "maxItems": 10, "query": "", "showHeadings": false, From e4a35f5377ebf99fe25c3096ce96da990546f4f9 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 01:27:44 -0800 Subject: [PATCH 103/133] grant grafa MI access to engineeringdata --- eng/provision-grafana.yaml | 92 +++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 5a776504b..6c90afea5 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -327,6 +327,7 @@ jobs: ClusterUri = "https://engdata.westus2.kusto.windows.net" Database = "engineeringdata" ResourceGroup = "helixstagingkusto" + SubscriptionId = "cab65fc3-d077-467d-931f-3932eabf36d3" # HelixStaging subscription } } else { $kustoConfig += @{ @@ -334,6 +335,7 @@ jobs: ClusterUri = "https://engsrvprod.westus.kusto.windows.net" Database = "engineeringdata" ResourceGroup = "helixprodkusto" + SubscriptionId = "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" # Helix production subscription } } @@ -345,71 +347,71 @@ jobs: foreach ($cluster in $kustoConfig) { Write-Host "Processing: $($cluster.ClusterName) - $($cluster.Database)" + Write-Host " Resource Group: $($cluster.ResourceGroup)" + Write-Host " Subscription: $($cluster.SubscriptionId)" - # Construct the AAD principal string for Kusto - $aadPrincipal = "aadapp=$clientId" + # Use principal-assignment API directly with known resource group + Write-Host " Creating database principal assignment..." 
- # Create the Kusto command to add database viewer - $kustoCommand = ".add database ['$($cluster.Database)'] viewers ('$aadPrincipal') 'Grafana Managed Identity - $managedIdentityName'" - - Write-Host " Executing Kusto command..." - Write-Host " Command: $kustoCommand" - - # Execute the command using Azure CLI try { - $result = az kusto script create ` + $assignmentResult = az kusto database-principal-assignment create ` --cluster-name $cluster.ClusterName ` --database-name $cluster.Database ` + --principal-assignment-name "grafana-$environment-viewer" ` + --principal-id $principalId ` + --principal-type App ` + --role Viewer ` --resource-group $cluster.ResourceGroup ` - --script-content $kustoCommand ` - --name "grant-grafana-access-$(Get-Date -Format 'yyyyMMddHHmmss')" ` - --force-update-tag "$(Get-Date -Format 'yyyyMMddHHmmss')" ` - --continue-on-errors false ` - --output none 2>&1 + --subscription $cluster.SubscriptionId ` + --tenant-id "72f988bf-86f1-41af-91ab-2d7cd011db47" ` + --output json 2>&1 if ($LASTEXITCODE -eq 0) { Write-Host " āœ“ Database viewer permission granted successfully" $successCount++ + } elseif ($assignmentResult -like "*already exists*" -or $assignmentResult -like "*AlreadyExists*" -or $assignmentResult -like "*Conflict*") { + Write-Host " āœ“ Database viewer permission already exists" + $successCount++ } else { Write-Warning " ⚠ Failed to grant database viewer permission" - Write-Warning " Error: $result" - Write-Host " ā„¹ļø Attempting alternative method using principal-assignment..." 
- - # Alternative: Use principal-assignment API - try { - az kusto database-principal-assignment create ` - --cluster-name $cluster.ClusterName ` - --database-name $cluster.Database ` - --principal-assignment-name "grafana-$environment-viewer" ` - --principal-id $principalId ` - --principal-type App ` - --role Viewer ` - --resource-group $cluster.ResourceGroup ` - --tenant-id "72f988bf-86f1-41af-91ab-2d7cd011db47" ` - --output none 2>&1 - - if ($LASTEXITCODE -eq 0) { - Write-Host " āœ“ Database viewer permission granted via principal-assignment" - $successCount++ - } else { - Write-Error " āœ— Both methods failed to grant database access" - $failCount++ - } - } catch { - Write-Error " āœ— Both methods failed to grant database access" - Write-Error " Exception: $($_.Exception.Message)" - $failCount++ - } + Write-Warning " Error: $assignmentResult" + Write-Host "" + Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" + Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" + $failCount++ + } + } catch { + Write-Warning " ⚠ Exception occurred: $($_.Exception.Message)" + Write-Host "" + Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" + Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" + Write-Warning " Error: $assignmentResult" + Write-Host "" + Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" + Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" + $failCount++ } } catch { - Write-Error " āœ— Failed to execute Kusto command" - Write-Error " Exception: $($_.Exception.Message)" + Write-Warning " ⚠ Exception occurred: $($_.Exception.Message)" + Write-Host "" + Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" + Write-Host " 
.add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" $failCount++ } Write-Host "" } + Write-Host "==========================================" + Write-Host "Kusto Access Grant Summary" + Write-Host "==========================================" + Write-Host "āœ“ Successful: $successCount / $($kustoConfig.Count)" + if ($failCount -gt 0) { + Write-Host "⚠ Failed: $failCount / $($kustoConfig.Count)" + Write-Host "" + Write-Host "Note: Manual Kusto grants may be required." + Write-Host "The pipeline will continue, but dashboards using Kusto data may not work until permissions are granted." + } Write-Host "" Write-Host "ā„¹ļø Kusto permission changes are effective immediately" Write-Host "" From 9ad1f73273b05ed327f0f688ac33bbb9d50db3ce Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 01:42:53 -0800 Subject: [PATCH 104/133] grant grafa MI access to engineeringdata --- eng/provision-grafana.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 6c90afea5..1adb1d54e 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -380,17 +380,6 @@ jobs: Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" $failCount++ } - } catch { - Write-Warning " ⚠ Exception occurred: $($_.Exception.Message)" - Write-Host "" - Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" - Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" - Write-Warning " Error: $assignmentResult" - Write-Host "" - Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" - Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" - $failCount++ - } } catch { Write-Warning " ⚠ Exception occurred: $($_.Exception.Message)" 
Write-Host "" From ddcce3ed4c98464cee4a9049d508c28161008fe8 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 02:07:27 -0800 Subject: [PATCH 105/133] grant grafana MI access to engineeringdata --- eng/provision-grafana.yaml | 110 +++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 1adb1d54e..e2eb3b12a 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -283,6 +283,104 @@ jobs: Write-Host "ā„¹ļø RBAC propagation can take 2-5 minutes for Azure Monitor queries to work" Write-Host "" + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Kusto Permissions' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "==========================================" + Write-Host "Granting Pipeline Service Principal Kusto Contributor Role" + Write-Host "==========================================" + Write-Host "" + + $environment = "${{ parameters.DeploymentEnvironment }}" + + # Get the current service principal identity + $currentUser = az account show --query 'user.name' -o tsv + Write-Host "Current service principal: $currentUser" + Write-Host "" + + # Define Kusto clusters based on environment + $kustoConfig = @() + + if ($environment -eq "Staging") { + $kustoConfig += @{ + ClusterName = "engdata" + ResourceGroup = "helixstagingkusto" + SubscriptionId = "cab65fc3-d077-467d-931f-3932eabf36d3" + } + } else { + $kustoConfig += @{ + ClusterName = "engsrvprod" + ResourceGroup = "helixprodkusto" + SubscriptionId = "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + } + + Write-Host "Granting Kusto Contributor role on Kusto clusters..." 
+ Write-Host "" + + $successCount = 0 + $failCount = 0 + + foreach ($cluster in $kustoConfig) { + Write-Host "Processing: $($cluster.ClusterName)" + Write-Host " Resource Group: $($cluster.ResourceGroup)" + Write-Host " Subscription: $($cluster.SubscriptionId)" + + $clusterScope = "/subscriptions/$($cluster.SubscriptionId)/resourceGroups/$($cluster.ResourceGroup)/providers/Microsoft.Kusto/clusters/$($cluster.ClusterName)" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $currentUser ` + --role "Contributor" ` + --scope $clusterScope ` + --query "[0].id" ` + --output tsv 2>$null + + if ($existingAssignment) { + Write-Host " āœ“ Role assignment already exists" + $successCount++ + } else { + Write-Host " Creating Contributor role assignment..." + + $result = az role assignment create ` + --role "Contributor" ` + --assignee $currentUser ` + --scope $clusterScope ` + --output none 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host " āœ“ Role assignment created successfully" + $successCount++ + } else { + Write-Warning " ⚠ Failed to create role assignment" + Write-Warning " Error: $result" + Write-Warning " This may need to be granted manually by subscription owners." + $failCount++ + } + } + + Write-Host "" + } + + Write-Host "==========================================" + Write-Host "Pipeline Kusto Permissions Summary" + Write-Host "==========================================" + Write-Host "āœ“ Successful: $successCount / $($kustoConfig.Count)" + if ($failCount -gt 0) { + Write-Warning "⚠ Failed: $failCount / $($kustoConfig.Count)" + Write-Warning "" + Write-Warning "Note: Pipeline service principal needs Contributor role on Kusto clusters." + Write-Warning "Kusto database permission grants may fail until this is resolved." 
+ } + Write-Host "" + + # Don't fail the pipeline if this step fails - it can be done manually + exit 0 + - task: AzureCLI@2 displayName: 'Grant Grafana Identity Kusto Database Access' inputs: @@ -396,14 +494,18 @@ jobs: Write-Host "==========================================" Write-Host "āœ“ Successful: $successCount / $($kustoConfig.Count)" if ($failCount -gt 0) { - Write-Host "⚠ Failed: $failCount / $($kustoConfig.Count)" - Write-Host "" - Write-Host "Note: Manual Kusto grants may be required." - Write-Host "The pipeline will continue, but dashboards using Kusto data may not work until permissions are granted." + Write-Warning "⚠ Failed: $failCount / $($kustoConfig.Count)" + Write-Warning "" + Write-Warning "Note: Manual Kusto grants are required." + Write-Warning "The pipeline will continue, but dashboards using Kusto data may not work until permissions are granted." } Write-Host "" Write-Host "ā„¹ļø Kusto permission changes are effective immediately" Write-Host "" + + # Exit successfully even if some grants failed - this is not a critical error + # Manual grants can be done via the commands provided above + exit 0 - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' From 6e3def10c70f742db6d30ef073a471a4d2da921a Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 02:30:41 -0800 Subject: [PATCH 106/133] remove grafana MI access to engineeringdata --- eng/provision-grafana.yaml | 224 ------------------------------------- 1 file changed, 224 deletions(-) diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index e2eb3b12a..2aa67a0d9 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -283,230 +283,6 @@ jobs: Write-Host "ā„¹ļø RBAC propagation can take 2-5 minutes for Azure Monitor queries to work" Write-Host "" - - task: AzureCLI@2 - displayName: 'Grant Pipeline Service Principal Kusto Permissions' - inputs: - azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 
'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "==========================================" - Write-Host "Granting Pipeline Service Principal Kusto Contributor Role" - Write-Host "==========================================" - Write-Host "" - - $environment = "${{ parameters.DeploymentEnvironment }}" - - # Get the current service principal identity - $currentUser = az account show --query 'user.name' -o tsv - Write-Host "Current service principal: $currentUser" - Write-Host "" - - # Define Kusto clusters based on environment - $kustoConfig = @() - - if ($environment -eq "Staging") { - $kustoConfig += @{ - ClusterName = "engdata" - ResourceGroup = "helixstagingkusto" - SubscriptionId = "cab65fc3-d077-467d-931f-3932eabf36d3" - } - } else { - $kustoConfig += @{ - ClusterName = "engsrvprod" - ResourceGroup = "helixprodkusto" - SubscriptionId = "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" - } - } - - Write-Host "Granting Kusto Contributor role on Kusto clusters..." - Write-Host "" - - $successCount = 0 - $failCount = 0 - - foreach ($cluster in $kustoConfig) { - Write-Host "Processing: $($cluster.ClusterName)" - Write-Host " Resource Group: $($cluster.ResourceGroup)" - Write-Host " Subscription: $($cluster.SubscriptionId)" - - $clusterScope = "/subscriptions/$($cluster.SubscriptionId)/resourceGroups/$($cluster.ResourceGroup)/providers/Microsoft.Kusto/clusters/$($cluster.ClusterName)" - - # Check if role assignment already exists - $existingAssignment = az role assignment list ` - --assignee $currentUser ` - --role "Contributor" ` - --scope $clusterScope ` - --query "[0].id" ` - --output tsv 2>$null - - if ($existingAssignment) { - Write-Host " āœ“ Role assignment already exists" - $successCount++ - } else { - Write-Host " Creating Contributor role assignment..." 
- - $result = az role assignment create ` - --role "Contributor" ` - --assignee $currentUser ` - --scope $clusterScope ` - --output none 2>&1 - - if ($LASTEXITCODE -eq 0) { - Write-Host " āœ“ Role assignment created successfully" - $successCount++ - } else { - Write-Warning " ⚠ Failed to create role assignment" - Write-Warning " Error: $result" - Write-Warning " This may need to be granted manually by subscription owners." - $failCount++ - } - } - - Write-Host "" - } - - Write-Host "==========================================" - Write-Host "Pipeline Kusto Permissions Summary" - Write-Host "==========================================" - Write-Host "āœ“ Successful: $successCount / $($kustoConfig.Count)" - if ($failCount -gt 0) { - Write-Warning "⚠ Failed: $failCount / $($kustoConfig.Count)" - Write-Warning "" - Write-Warning "Note: Pipeline service principal needs Contributor role on Kusto clusters." - Write-Warning "Kusto database permission grants may fail until this is resolved." - } - Write-Host "" - - # Don't fail the pipeline if this step fails - it can be done manually - exit 0 - - - task: AzureCLI@2 - displayName: 'Grant Grafana Identity Kusto Database Access' - inputs: - azureSubscription: '${{ parameters.ServiceConnectionName }}' - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "==========================================" - Write-Host "Granting Kusto Database Access to Grafana Identity" - Write-Host "==========================================" - Write-Host "" - - $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" - $rgName = "${{ parameters.GrafanaResourceGroup }}" - $environment = "${{ parameters.DeploymentEnvironment }}" - - # Get the user-assigned managed identity - $managedIdentityName = if ($environment -eq 'Production') { 'dnceng-managed-grafana' } else { 'dnceng-managed-grafana-staging' } - Write-Host "Retrieving managed identity: $managedIdentityName" - - $identity = az identity show --name 
$managedIdentityName --resource-group $rgName --query '{principalId:principalId, clientId:clientId}' --output json | ConvertFrom-Json - - if (-not $identity) { - Write-Error "Failed to retrieve managed identity: $managedIdentityName" - exit 1 - } - - $principalId = $identity.principalId - $clientId = $identity.clientId - - Write-Host "āœ“ Managed Identity: $managedIdentityName" - Write-Host "āœ“ Principal ID: $principalId" - Write-Host "āœ“ Client ID: $clientId" - Write-Host "" - - # Define Kusto clusters and databases based on environment - $kustoConfig = @() - - if ($environment -eq "Staging") { - $kustoConfig += @{ - ClusterName = "engdata" - ClusterUri = "https://engdata.westus2.kusto.windows.net" - Database = "engineeringdata" - ResourceGroup = "helixstagingkusto" - SubscriptionId = "cab65fc3-d077-467d-931f-3932eabf36d3" # HelixStaging subscription - } - } else { - $kustoConfig += @{ - ClusterName = "engsrvprod" - ClusterUri = "https://engsrvprod.westus.kusto.windows.net" - Database = "engineeringdata" - ResourceGroup = "helixprodkusto" - SubscriptionId = "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" # Helix production subscription - } - } - - Write-Host "Granting database viewer permissions on Kusto clusters..." - Write-Host "" - - $successCount = 0 - $failCount = 0 - - foreach ($cluster in $kustoConfig) { - Write-Host "Processing: $($cluster.ClusterName) - $($cluster.Database)" - Write-Host " Resource Group: $($cluster.ResourceGroup)" - Write-Host " Subscription: $($cluster.SubscriptionId)" - - # Use principal-assignment API directly with known resource group - Write-Host " Creating database principal assignment..." 
- - try { - $assignmentResult = az kusto database-principal-assignment create ` - --cluster-name $cluster.ClusterName ` - --database-name $cluster.Database ` - --principal-assignment-name "grafana-$environment-viewer" ` - --principal-id $principalId ` - --principal-type App ` - --role Viewer ` - --resource-group $cluster.ResourceGroup ` - --subscription $cluster.SubscriptionId ` - --tenant-id "72f988bf-86f1-41af-91ab-2d7cd011db47" ` - --output json 2>&1 - - if ($LASTEXITCODE -eq 0) { - Write-Host " āœ“ Database viewer permission granted successfully" - $successCount++ - } elseif ($assignmentResult -like "*already exists*" -or $assignmentResult -like "*AlreadyExists*" -or $assignmentResult -like "*Conflict*") { - Write-Host " āœ“ Database viewer permission already exists" - $successCount++ - } else { - Write-Warning " ⚠ Failed to grant database viewer permission" - Write-Warning " Error: $assignmentResult" - Write-Host "" - Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" - Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" - $failCount++ - } - } catch { - Write-Warning " ⚠ Exception occurred: $($_.Exception.Message)" - Write-Host "" - Write-Host " Manual grant required - run this in Kusto Query Editor ($($cluster.ClusterUri)):" - Write-Host " .add database ['$($cluster.Database)'] viewers ('aadapp=$clientId') 'Grafana Managed Identity'" - $failCount++ - } - - Write-Host "" - } - - Write-Host "==========================================" - Write-Host "Kusto Access Grant Summary" - Write-Host "==========================================" - Write-Host "āœ“ Successful: $successCount / $($kustoConfig.Count)" - if ($failCount -gt 0) { - Write-Warning "⚠ Failed: $failCount / $($kustoConfig.Count)" - Write-Warning "" - Write-Warning "Note: Manual Kusto grants are required." 
- Write-Warning "The pipeline will continue, but dashboards using Kusto data may not work until permissions are granted." - } - Write-Host "" - Write-Host "ā„¹ļø Kusto permission changes are effective immediately" - Write-Host "" - - # Exit successfully even if some grants failed - this is not a critical error - # Manual grants can be done via the commands provided above - exit 0 - - task: AzureCLI@2 displayName: 'Install Azure Managed Grafana Extension' inputs: From e9d06c92c88808a92797ea56004e21e22b3185a7 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 23 Nov 2025 22:49:59 -0800 Subject: [PATCH 107/133] fix data source for dashboard --- .../arcade-services/arcadeAvailability.dashboard.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json index 9299dae5a..68d1a432e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json @@ -1195,7 +1195,8 @@ "dimensionFilters": [], "metricDefinition": "Microsoft.Insights/components", "metricName": "pcs.queue.wait_time", - "metricNamespace": "Azure.ApplicationInsights", + "customNamespace": "Azure.ApplicationInsights", + "metricNamespace": "Microsoft.Insights/components", "resourceGroup": "[parameter(product-construction-service-resourcegroup)]", "resourceName": "[parameter(product-construction-service-appinsights-resourcename)]", "timeGrain": "auto" From 88c34a10e7fe042fbc0c74d5555b90766cd95c42 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 25 Nov 2025 11:47:00 -0800 Subject: [PATCH 108/133] update grafana dashboards --- ...-items-waiting-time-build-pools.alert.json | 167 ++++++++++++++++++ 
...-items-waiting-time-test-queues.alert.json | 167 ++++++++++++++++++ ...-items-waiting-time-build-pools.alert.json | 167 ++++++++++++++++++ ...-items-waiting-time-test-queues.alert.json | 167 ++++++++++++++++++ .../dashboard/general/home.dashboard.json | 2 +- 5 files changed, 669 insertions(+), 1 deletion(-) create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json create mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json new file mode 100644 index 000000000..091d30cca --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json @@ -0,0 +1,167 @@ +{ + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', 
Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + 
"to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json new file mode 100644 index 000000000..24736ef49 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json @@ -0,0 +1,167 @@ +{ + "uid": "work-items-waiting-time-test-queues", + "title": "Work Items Waiting Time Is Too High (Test Queues)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": 
"WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": 
"__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-test-queues" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json new file mode 100644 index 000000000..091d30cca --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json @@ -0,0 +1,167 @@ +{ + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": 
"and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + 
"params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json new file mode 100644 index 000000000..24736ef49 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json @@ -0,0 +1,167 @@ +{ + "uid": "work-items-waiting-time-test-queues", + "title": "Work Items Waiting Time Is Too High (Test Queues)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": 
"grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", 
+ "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." 
+ }, + "labels": { + "NotificationId": "work-items-waiting-time-test-queues" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json index 18a68c6ac..a0c0055f3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -36,7 +36,7 @@ "content": "\n# .NET Engineering Systems Monitoring\n\nFor questions or permission issues, email [dnceng@microsoft.com](mailto:dnceng@microsoft.com)\n\nThis monitoring site is used to monitor all services managed by the .NET Engineering team.\nFor information about what sorts of things are monitored, and how to go about adding new monitoring or alerting, see the [Guidance](https://github.com/dotnet/core-eng/blob/master/Documentation/Alerting.md).\n\nTo see information about privacy and cookies visit: [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkId=521839).\n\n", "mode": "markdown" }, - "title": "Home", + "title": "Introduction", "type": "text" }, { From 5606316ff81ebb34a834c391f101455e5a4f3098 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 25 Nov 2025 12:48:57 -0800 Subject: [PATCH 109/133] set homepage preference --- src/Monitoring/Sdk/GrafanaClient.cs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index ac04a9375..e9b8c44ae 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -436,13 +436,24 @@ public async Task CreateAlertRuleAsync(JObject alertRule) public async Task 
SetHomeDashboardAsync(string dashboardUid) { + // Set organization preferences (home dashboard and timezone) var preferences = new JObject { - {"homeDashboardUID", dashboardUid} + {"homeDashboardUID", dashboardUid}, + {"timezone", "browser"} }; - var uri = new Uri(new Uri(_baseUrl), "/api/org/preferences"); - await SendObjectAsync(preferences, uri, HttpMethod.Put).ConfigureAwait(false); + var preferencesUri = new Uri(new Uri(_baseUrl), "/api/org/preferences"); + await SendObjectAsync(preferences, preferencesUri, HttpMethod.Put).ConfigureAwait(false); + + // Set organization name + var orgDetails = new JObject + { + {"name", ".NET Engineering Services"} + }; + + var orgUri = new Uri(new Uri(_baseUrl), "/api/org"); + await SendObjectAsync(orgDetails, orgUri, HttpMethod.Put).ConfigureAwait(false); } public void Dispose() From c223f4578f33c9bd6b15ac0eb07031913556adc1 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 26 Nov 2025 18:31:38 -0800 Subject: [PATCH 110/133] fix dashboard alert annotations --- .../Production/cores-consumption.alert.json | 8 +- ...otneteng-status-failed-requests.alert.json | 14 +- .../helix-api-availability.alert.json | 8 +- ...helix-api-average-response-time.alert.json | 4 + ...elix-autoscaler-service-stopped.alert.json | 6 +- .../pcs-background-worker-stopped.alert.json | 4 + ...ontainer-job-execution-failures.alert.json | 4 + .../pcs-disk-space-issues.alert.json | 4 + .../Production/pcs-exceptions-high.alert.json | 4 + .../pcs-git-push-success-rate.alert.json | 4 + .../pcs-work-item-success-rate.alert.json | 4 + .../Production/quota-eastus.alert.json | 250 +++++++------- .../Production/quota-westus.alert.json | 212 ++++++------ .../Production/quota-westus2.alert.json | 4 + .../source-dot-net-availability.alert.json | 6 +- ...-items-waiting-time-build-pools.alert.json | 324 +++++++++--------- ...-items-waiting-time-test-queues.alert.json | 4 + .../Staging/cores-consumption.alert.json | 8 +- ...otneteng-status-failed-requests.alert.json 
| 14 +- .../Staging/helix-api-availability.alert.json | 8 +- ...helix-api-average-response-time.alert.json | 4 + ...elix-autoscaler-service-stopped.alert.json | 6 +- .../pcs-background-worker-stopped.alert.json | 14 +- ...ontainer-job-execution-failures.alert.json | 4 + .../Staging/pcs-disk-space-issues.alert.json | 4 + .../Staging/pcs-exceptions-high.alert.json | 14 +- .../pcs-git-push-success-rate.alert.json | 14 +- .../pcs-work-item-success-rate.alert.json | 4 + .../Staging/quota-eastus.alert.json | 250 +++++++------- .../Staging/quota-westus.alert.json | 212 ++++++------ .../Staging/quota-westus2.alert.json | 4 + .../source-dot-net-availability.alert.json | 6 +- ...-items-waiting-time-build-pools.alert.json | 324 +++++++++--------- ...-items-waiting-time-test-queues.alert.json | 324 +++++++++--------- .../arcadeAvailability.dashboard.json | 40 ++- .../arcade-services/quota.dashboard.json | 34 +- 36 files changed, 1176 insertions(+), 976 deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json index 3450d1b7b..72d8d8783 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -14,7 +14,9 @@ "model": { "azureLogAnalytics": { "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| 
project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resources": [ + "[parameter(dotnet-eng-appinsights-resourcepath)]" + ], "resultFormat": "time_series", "workspace": "[parameter(default-workspace-resourcepath)]" }, @@ -95,11 +97,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "15", "description": "Cores consumption by Autoscaler is above 95% of limit" }, "labels": { "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" }, + "__dashboardUid__": "quota", + "__panelId__": "15", "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json index e8b66ef92..1cf07a889 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json @@ -86,14 +86,18 @@ "conditions": [ { "evaluator": { - "params": [20], + "params": [ + 20 + ], "type": "gt" }, "operator": { "type": "and" }, "query": { - "params": ["B"] + "params": [ + "B" + ] }, "type": "query" } @@ -113,11 +117,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", "description": "The number of failed DotNetEng Status 
requests per hour is above 20. This may indicate a systemic problem that needs to be investigated.\\nTo intially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" }, "labels": { "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", "folderUID": "arcade-services", "ruleGroup": "DotNetEng Status Alerts", "intervalMs": 900000, @@ -127,4 +135,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json index 85e307d34..266e38272 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json @@ -62,7 +62,7 @@ "subscriptions": [ "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", "cab65fc3-d077-467d-931f-3932eabf36d3" - ] + ] } }, { @@ -142,11 +142,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", "description": "Helix API availability alert!" 
}, "labels": { "NotificationId": "6179576701874a7abc440a574cf636d0" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", "folderUID": "arcade-services", "ruleGroup": "Helix Alerts", "intervalMs": 900000, @@ -156,4 +160,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json index b3cfd67b6..12b8766ac 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json @@ -131,11 +131,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", "description": "Helix API Average Response Time is high!" }, "labels": { "NotificationId": "24cae10d9eca44079e7cf3d47f148497" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", "folderUID": "arcade-services", "ruleGroup": "Helix Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json index cb3c942e3..ee7b4c4d3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json @@ -139,11 +139,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." 
}, "labels": { "NotificationId": "6213d3c5ce9a46278343bf075798e46f" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", "folderUID": "arcade-services", "ruleGroup": "Helix Alerts", "intervalMs": 900000, @@ -153,4 +157,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json index 082c7bfbc..ea948a483 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json @@ -139,11 +139,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "23909d48866646408f669cc1c3d325ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json index 1a18b0ff1..aa979ab79 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json @@ -122,11 +122,15 @@ "for": "5m", "frequency": "1m", 
"annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\\n\\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" }, "labels": { "NotificationId": "0a5c68b0daf846ef83a66c6c70fd24ad" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json index 12adcd112..fc8ac0dea 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json @@ -111,11 +111,15 @@ "for": "5m", "frequency": "5m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json index 9a9cc78e7..bc6479b60 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json @@ -111,11 +111,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "08f669cc1c3d325ee488666464" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json index 9fa19d01b..1d5c2f62e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json @@ -111,11 +111,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\\n\\nPCS has a high `git push` failure rate, please investigate\\n\\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalSeconds": 60, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json index 6e11755a7..61290ac69 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json @@ -151,11 +151,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "d71fe025a8954b6cad9866354ca041ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json index 0c8dd6207..ebdc0ffd0 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json @@ -1,127 +1,135 @@ { - "uid": "quota-eastus", - "title": "Azure quota usage for east us", - "condition": "C", - "data": [ - { - "refId": "A", - "queryType": "Azure Log Analytics", - "datasourceUid": "F2XodEi7z", - "model": { - "azureLogAnalytics": { - "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = 
tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", - "resultFormat": "time_series", - "workspace": "[parameter(default-workspace-resourcepath)]" - }, - "azureMonitor": { - "dimensionFilter": "*", - "dimensionFilters": [], - "timeGrain": "auto", - "top": "10" - }, - "datasource": { - "type": "grafana-azure-monitor-datasource", - "uid": "F2XodEi7z" - }, - "queryType": "Azure Log Analytics", - "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", - "cab65fc3-d077-467d-931f-3932eabf36d3" - ] - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 - } - }, - { - "refId": "B", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 
'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 86400, + "to": 0 + } }, - "expression": "A", - "reducer": "mean", - "refId": "B", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [95], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": ["B"] + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + 
"relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "expression": "B", - "refId": "C", - "type": "threshold" - } + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "24", + "description": "An Azure Resource Quota is nearing its limit in region eastus!" + }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "__dashboardUid__": "quota", + "__panelId__": "24", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "KeepLast", - "execErrState": "KeepLast", - "for": "5m", - "frequency": "1m", - "annotations": { - "description": "An Azure Resource Quota is nearing its limit in region eastus!" 
- }, - "labels": { - "NotificationId": "b50b57fa7d1840438da5232711af4485" - }, - "folderUID": "arcade-services", - "ruleGroup": "Azure Quota Alerts", - "intervalMs": 900000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json index ef697622f..beea05ea9 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json @@ -1,108 +1,116 @@ { - "uid": "quota-westus", - "title": "Azure quota usage for west us", - "condition": "C", - "data": [ - { - "refId": "A", - "queryType": "Azure Log Analytics", - "datasourceUid": "F2XodEi7z", - "model": { - "azureLogAnalytics": { - "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", - "resultFormat": "time_series", - "workspace": "[parameter(default-workspace-resourcepath)]" - }, - "azureMonitor": { - "dimensionFilter": "*", - 
"dimensionFilters": [], - "timeGrain": "auto", - "top": "10" + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } }, - "datasource": { - "type": "grafana-azure-monitor-datasource", - "uid": "F2XodEi7z" + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": 
"mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "queryType": "Azure Log Analytics", - "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", - "cab65fc3-d077-467d-931f-3932eabf36d3" - ] - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 - } + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "12", + "description": "An Azure Resource Quota is nearing its limit in region westus!" 
}, - { - "refId": "B", - "queryType": "", - "datasourceUid": "-100", - "model": { - "datasource": { - "type": "__expr__", - "uid": "-100" - }, - "expression": "A", - "reducer": "mean", - "refId": "B", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [95], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": ["B"] - }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" - }, - "expression": "B", - "refId": "C", - "type": "threshold" - } + "__dashboardUid__": "quota", + "__panelId__": "12", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "KeepLast", - "execErrState": "KeepLast", - "for": "5m", - "frequency": "1m", - "annotations": { - "description": "An Azure Resource Quota is nearing its limit in region westus!" 
- }, - "labels": { - "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" - }, - "folderUID": "arcade-services", - "ruleGroup": "Azure Quota Alerts", - "intervalMs": 900000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json index 09a010687..6204af829 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json @@ -91,11 +91,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "13", "description": "An Azure Resource Quota is nearing its limit in region westus2!" }, "labels": { "NotificationId": "44aff3c937c042caa09f821ae923c26c" }, + "__dashboardUid__": "quota", + "__panelId__": "13", "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json index 11ff1f0f1..205291f2b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json @@ -136,11 +136,15 @@ "for": "15m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", "description": "source.dot.net availability is low!" 
}, "labels": { "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", "folderUID": "arcade-services", "ruleGroup": "Source Browser Alerts", "intervalSeconds": 900000, @@ -150,4 +154,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json index 091d30cca..8a952b6ae 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json @@ -1,167 +1,171 @@ { - "uid": "work-items-waiting-time-build-pools", - "title": "Work Items Waiting Time Is Too High (Build Pools)", - "condition": "D", - "data": [ - { - "refId": "A", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", - "querySource": "raw", - 
"rawMode": true, - "refId": "A", - "resultFormat": "table" - } - }, - { - "refId": "B", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", - "querySource": "raw", - "rawMode": true, - "refId": "B", - "resultFormat": "table" - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + 
}, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } }, - "expression": "B", - "reducer": "min", - "refId": "C", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "D", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [ - 30 - ], - "type": "gt" - }, - "operator": { - "type": "and" + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "C" - ] + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc 
", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "expression": "C", - "refId": "D", - "type": "threshold" - } + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "home", + "__panelId__": "4", + "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "__dashboardUid__": "home", + "__panelId__": "4", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "OK", - "execErrState": "Alerting", - "for": "5m", - "frequency": "5m", - "annotations": { - "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." 
- }, - "labels": { - "NotificationId": "work-items-waiting-time-build-pools" - }, - "folderUID": "arcade-services", - "ruleGroup": "Helix Queue Alerts", - "intervalMs": 300000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json index 24736ef49..bc9c62280 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json @@ -150,11 +150,15 @@ "for": "5m", "frequency": "5m", "annotations": { + "__dashboardUid__": "home", + "__panelId__": "10", "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." 
}, "labels": { "NotificationId": "work-items-waiting-time-test-queues" }, + "__dashboardUid__": "home", + "__panelId__": "10", "folderUID": "arcade-services", "ruleGroup": "Helix Queue Alerts", "intervalMs": 300000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json index 3450d1b7b..72d8d8783 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json @@ -14,7 +14,9 @@ "model": { "azureLogAnalytics": { "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resources": [ + "[parameter(dotnet-eng-appinsights-resourcepath)]" + ], "resultFormat": "time_series", "workspace": "[parameter(default-workspace-resourcepath)]" }, @@ -95,11 +97,15 @@ "for": "5m", "frequency": "1m", "annotations": { + 
"__dashboardUid__": "quota", + "__panelId__": "15", "description": "Cores consumption by Autoscaler is above 95% of limit" }, "labels": { "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" }, + "__dashboardUid__": "quota", + "__panelId__": "15", "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json index e8b66ef92..1cf07a889 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json @@ -86,14 +86,18 @@ "conditions": [ { "evaluator": { - "params": [20], + "params": [ + 20 + ], "type": "gt" }, "operator": { "type": "and" }, "query": { - "params": ["B"] + "params": [ + "B" + ] }, "type": "query" } @@ -113,11 +117,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", "description": "The number of failed DotNetEng Status requests per hour is above 20. 
This may indicate a systemic problem that needs to be investigated.\\nTo intially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" }, "labels": { "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", "folderUID": "arcade-services", "ruleGroup": "DotNetEng Status Alerts", "intervalMs": 900000, @@ -127,4 +135,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json index 85e307d34..266e38272 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json @@ -62,7 +62,7 @@ "subscriptions": [ "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", "cab65fc3-d077-467d-931f-3932eabf36d3" - ] + ] } }, { @@ -142,11 +142,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", "description": "Helix API availability alert!" 
}, "labels": { "NotificationId": "6179576701874a7abc440a574cf636d0" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", "folderUID": "arcade-services", "ruleGroup": "Helix Alerts", "intervalMs": 900000, @@ -156,4 +160,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json index b3cfd67b6..5501e3377 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json @@ -131,9 +131,13 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", "description": "Helix API Average Response Time is high!" }, "labels": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", "NotificationId": "24cae10d9eca44079e7cf3d47f148497" }, "folderUID": "arcade-services", diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json index cb3c942e3..ee7b4c4d3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json @@ -139,11 +139,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." 
}, "labels": { "NotificationId": "6213d3c5ce9a46278343bf075798e46f" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", "folderUID": "arcade-services", "ruleGroup": "Helix Alerts", "intervalMs": 900000, @@ -153,4 +157,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json index 082c7bfbc..718136d73 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json @@ -112,14 +112,18 @@ "conditions": [ { "evaluator": { - "params": [20], + "params": [ + 20 + ], "type": "lt" }, "operator": { "type": "and" }, "query": { - "params": ["C"] + "params": [ + "C" + ] }, "type": "query" } @@ -139,11 +143,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "23909d48866646408f669cc1c3d325ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, @@ -153,4 +161,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json index 1a18b0ff1..aa979ab79 
100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json @@ -122,11 +122,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\\n\\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" }, "labels": { "NotificationId": "0a5c68b0daf846ef83a66c6c70fd24ad" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json index 12adcd112..fc8ac0dea 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json @@ -111,11 +111,15 @@ "for": "5m", "frequency": "5m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git 
a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json index 9a9cc78e7..cb6ecb002 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json @@ -84,14 +84,18 @@ "conditions": [ { "evaluator": { - "params": [15], + "params": [ + 15 + ], "type": "gt" }, "operator": { "type": "and" }, "query": { - "params": ["B"] + "params": [ + "B" + ] }, "type": "query" } @@ -111,11 +115,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "08f669cc1c3d325ee488666464" }, + "dashboardUid": "arcadeAvailability", + "panelId": "46", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, @@ -125,4 +133,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json index 9fa19d01b..603ee2aa6 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json @@ -84,14 +84,18 @@ "conditions": [ { "evaluator": { - "params": [80], + "params": [ + 80 + ], "type": "lt" }, "operator": { "type": "and" }, "query": { - "params": ["B"] + "params": [ + "B" + ] }, "type": 
"query" } @@ -111,11 +115,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\\n\\nPCS has a high `git push` failure rate, please investigate\\n\\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalSeconds": 60, @@ -125,4 +133,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json index 6e11755a7..61290ac69 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json @@ -151,11 +151,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" }, "labels": { "NotificationId": "d71fe025a8954b6cad9866354ca041ee" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", "folderUID": "arcade-services", "ruleGroup": "PCS Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json index 0c8dd6207..ebdc0ffd0 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json @@ -1,127 +1,135 @@ { - "uid": "quota-eastus", - "title": "Azure quota usage for east us", - "condition": "C", - "data": [ - { - "refId": "A", - "queryType": "Azure Log Analytics", - "datasourceUid": "F2XodEi7z", - "model": { - "azureLogAnalytics": { - "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", - "resultFormat": "time_series", - "workspace": "[parameter(default-workspace-resourcepath)]" - }, - "azureMonitor": { - "dimensionFilter": "*", - "dimensionFilters": [], - "timeGrain": "auto", - "top": "10" - }, - "datasource": { - "type": "grafana-azure-monitor-datasource", - "uid": "F2XodEi7z" - }, - "queryType": "Azure Log Analytics", - "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", - "cab65fc3-d077-467d-931f-3932eabf36d3" - ] - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 - } - }, - 
{ - "refId": "B", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 86400, + "to": 0 + } }, - 
"expression": "A", - "reducer": "mean", - "refId": "B", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [95], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": ["B"] + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "expression": "B", - "refId": "C", - "type": "threshold" - } + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "24", + "description": "An Azure Resource Quota is nearing its limit in region eastus!" 
+ }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "__dashboardUid__": "quota", + "__panelId__": "24", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "KeepLast", - "execErrState": "KeepLast", - "for": "5m", - "frequency": "1m", - "annotations": { - "description": "An Azure Resource Quota is nearing its limit in region eastus!" - }, - "labels": { - "NotificationId": "b50b57fa7d1840438da5232711af4485" - }, - "folderUID": "arcade-services", - "ruleGroup": "Azure Quota Alerts", - "intervalMs": 900000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json index ef697622f..beea05ea9 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json @@ -1,108 +1,116 @@ { - "uid": "quota-westus", - "title": "Azure quota usage for west us", - "condition": "C", - "data": [ - { - "refId": "A", - "queryType": "Azure Log Analytics", - "datasourceUid": "F2XodEi7z", - "model": { - "azureLogAnalytics": { - "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 
'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", - "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", - "resultFormat": "time_series", - "workspace": "[parameter(default-workspace-resourcepath)]" - }, - "azureMonitor": { - "dimensionFilter": "*", - "dimensionFilters": [], - "timeGrain": "auto", - "top": "10" + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + 
"queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 86400, + "to": 0 + } }, - "datasource": { - "type": "grafana-azure-monitor-datasource", - "uid": "F2XodEi7z" + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "queryType": "Azure Log Analytics", - "refId": "A", - "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", - "subscriptions": [ - "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", - "cab65fc3-d077-467d-931f-3932eabf36d3" - ] - }, - "relativeTimeRange": { - "from": 86400, - "to": 0 - } + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "12", + "description": "An Azure Resource Quota is nearing its limit in region westus!" 
}, - { - "refId": "B", - "queryType": "", - "datasourceUid": "-100", - "model": { - "datasource": { - "type": "__expr__", - "uid": "-100" - }, - "expression": "A", - "reducer": "mean", - "refId": "B", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [95], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": ["B"] - }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" - }, - "expression": "B", - "refId": "C", - "type": "threshold" - } + "__dashboardUid__": "quota", + "__panelId__": "12", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "KeepLast", - "execErrState": "KeepLast", - "for": "5m", - "frequency": "1m", - "annotations": { - "description": "An Azure Resource Quota is nearing its limit in region westus!" 
- }, - "labels": { - "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" - }, - "folderUID": "arcade-services", - "ruleGroup": "Azure Quota Alerts", - "intervalMs": 900000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json index 09a010687..6204af829 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json @@ -91,11 +91,15 @@ "for": "5m", "frequency": "1m", "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "13", "description": "An Azure Resource Quota is nearing its limit in region westus2!" }, "labels": { "NotificationId": "44aff3c937c042caa09f821ae923c26c" }, + "__dashboardUid__": "quota", + "__panelId__": "13", "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", "intervalMs": 900000, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json index 11ff1f0f1..205291f2b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json @@ -136,11 +136,15 @@ "for": "15m", "frequency": "1m", "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", "description": "source.dot.net availability is low!" 
}, "labels": { "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", "folderUID": "arcade-services", "ruleGroup": "Source Browser Alerts", "intervalSeconds": 900000, @@ -150,4 +154,4 @@ "group_wait": "5m", "repeat_interval": "4h" } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json index 091d30cca..8a952b6ae 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json @@ -1,167 +1,171 @@ { - "uid": "work-items-waiting-time-build-pools", - "title": "Work Items Waiting Time Is Too High (Build Pools)", - "condition": "D", - "data": [ - { - "refId": "A", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", - "querySource": "raw", - "rawMode": 
true, - "refId": "A", - "resultFormat": "table" - } - }, - { - "refId": "B", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", - "querySource": "raw", - "rawMode": true, - "refId": "B", - "resultFormat": "table" - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + 
"where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } }, - "expression": "B", - "reducer": "min", - "refId": "C", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "D", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [ - 30 - ], - "type": "gt" - }, - "operator": { - "type": "and" + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "C" - ] + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + 
"querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "expression": "C", - "refId": "D", - "type": "threshold" - } + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "home", + "__panelId__": "4", + "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "__dashboardUid__": "home", + "__panelId__": "4", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "OK", - "execErrState": "Alerting", - "for": "5m", - "frequency": "5m", - "annotations": { - "description": "95 percentile of work item waiting times is over 30 minutes. BuildPool queues only." 
- }, - "labels": { - "NotificationId": "work-items-waiting-time-build-pools" - }, - "folderUID": "arcade-services", - "ruleGroup": "Helix Queue Alerts", - "intervalMs": 300000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json index 24736ef49..f958904f2 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json @@ -1,167 +1,171 @@ { - "uid": "work-items-waiting-time-test-queues", - "title": "Work Items Waiting Time Is Too High (Test Queues)", - "condition": "D", - "data": [ - { - "refId": "A", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued 
asc ", - "querySource": "raw", - "rawMode": true, - "refId": "A", - "resultFormat": "table" - } - }, - { - "refId": "B", - "queryType": "", - "datasourceUid": "OlcfOPi7z", - "relativeTimeRange": { - "from": 86400, - "to": 0 - }, - "model": { - "database": "engineeringdata", - "datasource": { - "type": "grafana-azure-data-explorer-datasource", - "uid": "OlcfOPi7z" - }, - "expression": { - "groupBy": { - "expressions": [], - "type": "and" - }, - "reduce": { - "expressions": [], - "type": "and" - }, - "where": { - "expressions": [], - "type": "and" - } - }, - "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", - "querySource": "raw", - "rawMode": true, - "refId": "B", - "resultFormat": "table" - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" + "uid": "work-items-waiting-time-test-queues", + "title": "Work Items Waiting Time Is Too High (Test Queues)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + 
"expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } }, - "expression": "A", - "reducer": "min", - "refId": "C", - "type": "reduce" - }, - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "D", - "queryType": "", - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [ - 35 - ], - "type": "gt" - }, - "operator": { - "type": "and" + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 86400, + "to": 0 }, - "query": { - "params": [ - "C" - ] + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = 
percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "min", + "refId": "C", + "type": "reduce" }, - "type": "query" - } - ], - "datasource": { - "type": "__expr__", - "uid": "-100" + "relativeTimeRange": { + "from": 300, + "to": 0 + } }, - "expression": "C", - "refId": "D", - "type": "threshold" - } + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "home", + "__panelId__": "10", + "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." 
+ }, + "labels": { + "NotificationId": "work-items-waiting-time-test-queues" + }, + "__dashboardUid__": "home", + "__panelId__": "10", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" } - ], - "noDataState": "OK", - "execErrState": "Alerting", - "for": "5m", - "frequency": "5m", - "annotations": { - "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." - }, - "labels": { - "NotificationId": "work-items-waiting-time-test-queues" - }, - "folderUID": "arcade-services", - "ruleGroup": "Helix Queue Alerts", - "intervalMs": 300000, - "isPaused": false, - "notification_settings": { - "receiver": ".NET Status Alert", - "group_wait": "5m", - "repeat_interval": "4h" - } -} +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json index 68d1a432e..f4e926108 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json @@ -17,15 +17,28 @@ "type": "dashboard" }, { - "datasource": "Deployment Annotations", - "enable": false, + "datasource": { + "type": "yesoreyeram-infinity-datasource", + "uid": "deployment-annotations-infinity" + }, + "enable": true, + "hide": false, "iconColor": "blue", "name": "Deployments", "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" + "columns": [], + "filters": [], + "type": "json", + "source": "url", + "url": "/api/annotations/grafana?from=${__from:date:iso}&to=${__to:date:iso}", + "url_options": { + "data": "", + "method": "GET" + }, + "format": 
"dataframe", + "format_version": "1.0", + "parser": "backend", + "root_selector": "" } } ] @@ -130,7 +143,6 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "8.3.6", "targets": [ { "azureLogAnalytics": { @@ -2199,7 +2211,7 @@ "x": 12, "y": 57 }, - "id": 66, + "id": 75, "options": { "legend": { "calcs": [], @@ -2520,12 +2532,11 @@ "showOptions": "current", "sortOrder": 1, "stateFilter": { - "alerting": false, - "execution_error": false, - "no_data": false, - "ok": false, - "paused": false, - "pending": false + "firing": true, + "noData": true, + "normal": true, + "error": true, + "pending": true }, "tags": [] }, @@ -3687,5 +3698,6 @@ "timepicker": {}, "timezone": "", "title": "Service Availability", + "uid": "arcadeAvailability", "weekStart": "" } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json index 779a1ba7b..a8cb14d46 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json @@ -17,15 +17,28 @@ "type": "dashboard" }, { - "datasource": "Deployment Annotations", + "datasource": { + "type": "yesoreyeram-infinity-datasource", + "uid": "deployment-annotations-infinity" + }, "enable": true, + "hide": false, "iconColor": "blue", "name": "Deployments", "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" + "columns": [], + "filters": [], + "type": "json", + "source": "url", + "url": "/api/annotations/grafana?from=${__from:date:iso}&to=${__to:date:iso}", + "url_options": { + "data": "", + "method": "GET" + }, + "format": "dataframe", + "format_version": "1.0", + "parser": "backend", + "root_selector": "" } } ] @@ -70,12 +83,11 @@ "showOptions": "current", "sortOrder": 1, "stateFilter": { - "alerting": false, - "execution_error": 
false, - "no_data": false, - "ok": false, - "paused": false, - "pending": false + "firing": true, + "noData": true, + "normal": true, + "error": true, + "pending": true }, "tags": [] }, From af361446aa1a942251e64eded789eb3185e85432 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 26 Nov 2025 18:34:49 -0800 Subject: [PATCH 111/133] add grafana keyvault manifest file --- .vault-config/dnceng-amg-int-kv.yaml | 25 ++++++++++++++++++++----- .vault-config/dnceng-amg-prod-kv.yaml | 27 +++++++++++++++++++++------ 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml index 34cbf9b3b..c0ebb0381 100644 --- a/.vault-config/dnceng-amg-int-kv.yaml +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -4,8 +4,23 @@ storageLocation: subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 name: dnceng-amg-int-kv -import: - - shared/dotnet-grafana-secrets.yaml: - - dotnet-build-bot-dotnet-eng-status-token - - dotneteng-status-auth-header - - fr-bot-notifications-teams-notification-url +secrets: + # Copy only the secrets needed for Azure Managed Grafana datasources and notifications + + # API token for DotNet Status website + dotnet-build-bot-dotnet-eng-status-token: + type: text + parameters: + description: API token from https://dotneteng-status-staging.azurewebsites.net/ - Generated using dotnet-build-bot account + + # Authorization header for Deployment Annotations datasource + dotneteng-status-auth-header: + type: text + parameters: + description: "Bearer token for status API - Format: Bearer " + + # Teams webhook URL for alert notifications + fr-bot-notifications-teams-notification-url: + type: text + parameters: + description: Teams Incoming Webhook URL - Do not rotate \ No newline at end of file diff --git a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml index 34cbf9b3b..c6effbac6 100644 --- a/.vault-config/dnceng-amg-prod-kv.yaml +++ 
b/.vault-config/dnceng-amg-prod-kv.yaml @@ -2,10 +2,25 @@ storageLocation: type: azure-key-vault parameters: subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 - name: dnceng-amg-int-kv + name: dnceng-amg-prod-kv -import: - - shared/dotnet-grafana-secrets.yaml: - - dotnet-build-bot-dotnet-eng-status-token - - dotneteng-status-auth-header - - fr-bot-notifications-teams-notification-url +secrets: + # Copy only the secrets needed for Azure Managed Grafana datasources and notifications + + # API token for DotNet Status website + dotnet-build-bot-dotnet-eng-status-token: + type: text + parameters: + description: API token from https://dotneteng-status.azurewebsites.net/ - Generated using dotnet-build-bot account + + # Authorization header for Deployment Annotations datasource + dotneteng-status-auth-header: + type: text + parameters: + description: "Bearer token for status API - Format: Bearer " + + # Teams webhook URL for alert notifications + fr-bot-notifications-teams-notification-url: + type: text + parameters: + description: Teams Incoming Webhook URL - Do not rotate From 83d0b66d08f263f4f0f96ba8df06c127c2df8900 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 26 Nov 2025 18:52:44 -0800 Subject: [PATCH 112/133] add grafana annotation settings for infinity datasource --- .../Controllers/AnnotationsController.cs | 88 +++++++++++++++++++ .../Models/GrafanaAnnotation.cs | 13 +++ .../Models/GrafanaAnnotationQuery.cs | 23 +++++ ...ent Annotations (Infinity).datasource.json | 16 ++++ ...ent Annotations (Infinity).datasource.json | 16 ++++ 5 files changed, 156 insertions(+) create mode 100644 src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs create mode 100644 src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs create mode 100644 src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json create mode 100644 
src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs index 9cacfd777..b82500651 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs @@ -142,4 +142,92 @@ public async Task>> Post(AnnotationQue return annotationEntries; } + + /// + /// Native Grafana annotations endpoint. Returns annotations in the format expected by + /// Grafana's built-in annotation queries. + /// Supports both POST with body and GET with query parameters. + /// + [HttpPost] + [HttpGet] + [Route("grafana")] + public async Task>> GetGrafanaAnnotations( + [FromBody(EmptyBodyBehavior = Microsoft.AspNetCore.Mvc.ModelBinding.EmptyBodyBehavior.Allow)] GrafanaAnnotationQuery query, + [FromQuery] string from, + [FromQuery] string to, + CancellationToken cancellationToken) + { + DateTime fromDate, toDate; + + if (query?.Range != null) + { + // POST request with body + fromDate = query.Range.From; + toDate = query.Range.To; + } + else if (!string.IsNullOrEmpty(from) && !string.IsNullOrEmpty(to)) + { + // GET request with query parameters + if (!DateTime.TryParse(from, out fromDate) || !DateTime.TryParse(to, out toDate)) + { + return BadRequest("Invalid date format"); + } + } + else + { + return BadRequest("Missing date range"); + } + + IEnumerable services = (query?.Annotation?.Query?.Split(',') ?? 
Array.Empty()) + .Where(s => !string.IsNullOrWhiteSpace(s)) + .Select(s => s.Trim()); + + if (services.Count() > _maximumServerCount) + { + return new List(); + } + + StringBuilder filterBuilder = new StringBuilder(); + filterBuilder.Append($"Started gt datetime'{fromDate:O}' and Ended lt datetime'{toDate:O}'"); + if (services.Any()) + { + filterBuilder.Append(" and ("); + filterBuilder.Append(string.Join(" or ", services.Select(s => $"PartitionKey eq '{s}'"))); + filterBuilder.Append(')'); + } + + string filter = filterBuilder.ToString(); + _logger.LogTrace("Compiled Grafana annotation filter query: {Query}", filter); + + TableClient tableClient = await GetCloudTable(); + IAsyncEnumerable entityQuery = tableClient.QueryAsync( + filter: filter, + cancellationToken: cancellationToken); + + List annotations = new List(); + await foreach (DeploymentEntity entity in entityQuery) + { + if (entity.Started == null && entity.Ended == null) + { + continue; + } + + var annotation = new GrafanaAnnotation + { + Time = entity.Started?.ToUnixTimeMilliseconds() ?? entity.Ended.Value.ToUnixTimeMilliseconds(), + Title = $"Deployment of {entity.Service}", + Tags = new[] { "deployment", "deploy", $"deploy-{entity.Service}", entity.Service }, + Text = $"Service: {entity.Service}" + }; + + if (entity.Started != null && entity.Ended != null) + { + annotation.TimeEnd = entity.Ended.Value.ToUnixTimeMilliseconds(); + } + + annotations.Add(annotation); + } + + return annotations; + } } diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs new file mode 100644 index 000000000..914987cf8 --- /dev/null +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs @@ -0,0 +1,13 @@ +namespace DotNet.Status.Web.Models; +public class GrafanaAnnotation +{ + public long Time { get; set; } + + public long? 
TimeEnd { get; set; } + + public string Title { get; set; } + + public string[] Tags { get; set; } + + public string Text { get; set; } +} diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs new file mode 100644 index 000000000..49053924d --- /dev/null +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs @@ -0,0 +1,23 @@ +using System; + +namespace DotNet.Status.Web.Models; +public class GrafanaAnnotationQuery +{ + public AnnotationQueryRange Range { get; set; } + public AnnotationDefinition Annotation { get; set; } +} + +public class AnnotationQueryRange +{ + public DateTime From { get; set; } + public DateTime To { get; set; } +} + +public class AnnotationDefinition +{ + public string Name { get; set; } + public string Datasource { get; set; } + public bool Enable { get; set; } + public string IconColor { get; set; } + public string Query { get; set; } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json new file mode 100644 index 000000000..2486df2a8 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json @@ -0,0 +1,16 @@ +{ + "uid": "deployment-annotations-infinity", + "name": "Deployment Annotations (Infinity)", + "type": "yesoreyeram-infinity-datasource", + "access": "proxy", + "url": "https://dotneteng-status.azurewebsites.net", + "jsonData": { + "tlsSkipVerify": false, + "httpHeaderName1": "Authorization" + }, + "secureJsonData": { + "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" + }, + "isDefault": false, + "readOnly": false +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json 
b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json new file mode 100644 index 000000000..37e92e155 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json @@ -0,0 +1,16 @@ +{ + "uid": "deployment-annotations-infinity", + "name": "Deployment Annotations (Infinity)", + "type": "yesoreyeram-infinity-datasource", + "access": "proxy", + "url": "https://dotneteng-status-staging.azurewebsites.net", + "jsonData": { + "tlsSkipVerify": false, + "httpHeaderName1": "Authorization" + }, + "secureJsonData": { + "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" + }, + "isDefault": false, + "readOnly": false +} From 2ca58acfc96bbe80d9228cf52995847d89300747 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Sun, 30 Nov 2025 22:47:45 -0800 Subject: [PATCH 113/133] show inactive alerts --- .../dashboard/arcade-services/arcadeAvailability.dashboard.json | 1 + .../dashboard/arcade-services/quota.dashboard.json | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json index f4e926108..728548ed2 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json @@ -2529,6 +2529,7 @@ "dashboardAlerts": true, "dashboardTitle": "", "maxItems": 10, + "showInactiveAlerts": true, "showOptions": "current", "sortOrder": 1, "stateFilter": { diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json index a8cb14d46..244cf0c4c 100644 --- 
a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json @@ -80,6 +80,7 @@ "dashboardAlerts": true, "dashboardTitle": "", "maxItems": 10, + "showInactiveAlerts": true, "showOptions": "current", "sortOrder": 1, "stateFilter": { From 7d0a9cd4d2b9872c2c37673268ae8b7b61a8f34e Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 3 Dec 2025 15:36:43 -0800 Subject: [PATCH 114/133] set alert rule timeframe --- .../Production/cores-consumption.alert.json | 2 +- ...otneteng-status-failed-requests.alert.json | 23 ++----------------- .../helix-api-availability.alert.json | 21 +---------------- ...helix-api-average-response-time.alert.json | 21 +---------------- ...elix-autoscaler-service-stopped.alert.json | 21 +---------------- .../pcs-background-worker-stopped.alert.json | 23 ++----------------- ...ontainer-job-execution-failures.alert.json | 21 +---------------- .../pcs-disk-space-issues.alert.json | 21 +---------------- .../Production/pcs-exceptions-high.alert.json | 21 +---------------- .../pcs-git-push-success-rate.alert.json | 21 +---------------- .../pcs-work-item-success-rate.alert.json | 23 ++----------------- .../Production/quota-eastus.alert.json | 21 +---------------- .../Production/quota-westus.alert.json | 2 +- .../Production/quota-westus2.alert.json | 2 +- .../source-dot-net-availability.alert.json | 21 +---------------- ...-items-waiting-time-build-pools.alert.json | 23 ++----------------- ...-items-waiting-time-test-queues.alert.json | 23 ++----------------- .../Staging/cores-consumption.alert.json | 2 +- ...otneteng-status-failed-requests.alert.json | 23 ++----------------- .../Staging/helix-api-availability.alert.json | 21 +---------------- ...helix-api-average-response-time.alert.json | 21 +---------------- ...elix-autoscaler-service-stopped.alert.json | 21 +---------------- .../pcs-background-worker-stopped.alert.json | 23 
++----------------- ...ontainer-job-execution-failures.alert.json | 21 +---------------- .../Staging/pcs-disk-space-issues.alert.json | 21 +---------------- .../Staging/pcs-exceptions-high.alert.json | 21 +---------------- .../pcs-git-push-success-rate.alert.json | 21 +---------------- .../pcs-work-item-success-rate.alert.json | 23 ++----------------- .../Staging/quota-eastus.alert.json | 21 +---------------- .../Staging/quota-westus.alert.json | 2 +- .../Staging/quota-westus2.alert.json | 2 +- .../source-dot-net-availability.alert.json | 21 +---------------- ...-items-waiting-time-build-pools.alert.json | 23 ++----------------- ...-items-waiting-time-test-queues.alert.json | 23 ++----------------- .../dashboard/general/home.dashboard.json | 4 ++-- .../Monitoring.ArcadeServices/parameters.json | 16 ++++++++++++- 36 files changed, 61 insertions(+), 579 deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json index 72d8d8783..57705e756 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 21600, + "from": 300, "to": 0 }, "model": { diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json index 1cf07a889..0ec5bb980 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json @@ -36,8 +36,8 @@ ] }, "relativeTimeRange": { 
- "from": 86400, - "to": 0 + "from": 3600, + "to": 600 } }, { @@ -45,25 +45,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1h", - "now-10m" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json index 266e38272..7923c81d5 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 1800, "to": 0 }, "model": { @@ -70,25 +70,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "30m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json index 12b8766ac..c70a2adcb 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -59,25 +59,6 @@ "queryType": "", 
"datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json index ee7b4c4d3..0ed88673b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 1800, "to": 0 }, "model": { @@ -71,25 +71,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "30m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json index ea948a483..03514f745 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -39,7 +39,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 
}, "model": { @@ -75,25 +75,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "B", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json index aa979ab79..8c6054e8a 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -54,25 +54,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json index fc8ac0dea..aa886e7fd 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,25 +43,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": 
{ - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json index bc6479b60..114fe14be 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -47,25 +47,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json index 1d5c2f62e..25d6b096c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json @@ -34,7 +34,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -43,25 +43,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git 
a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json index 61290ac69..a059858b8 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -70,7 +70,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -83,25 +83,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "B", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json index ebdc0ffd0..8e0db517e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -42,25 +42,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json index beea05ea9..cba370182 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json index 6204af829..cc70acc06 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 600, "to": 0 } }, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json index 205291f2b..e56a3241c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -64,25 +64,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json index 8a952b6ae..a4d269995 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json @@ -8,7 +8,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,7 +43,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -78,25 +78,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json index bc9c62280..f8d4292b3 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json @@ -8,7 +8,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,7 +43,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -78,25 +78,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - 
] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json index 72d8d8783..57705e756 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 21600, + "from": 300, "to": 0 }, "model": { diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json index 1cf07a889..0ec5bb980 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json @@ -36,8 +36,8 @@ ] }, "relativeTimeRange": { - "from": 86400, - "to": 0 + "from": 3600, + "to": 600 } }, { @@ -45,25 +45,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1h", - "now-10m" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json index 266e38272..7923c81d5 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 1800, "to": 0 }, "model": { @@ -70,25 +70,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "30m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json index 5501e3377..01189871e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -59,25 +59,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json index ee7b4c4d3..0ed88673b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json @@ 
-8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 1800, "to": 0 }, "model": { @@ -71,25 +71,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "30m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json index 718136d73..847055a55 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -39,7 +39,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -75,25 +75,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "B", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json index aa979ab79..8c6054e8a 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -54,25 +54,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json index fc8ac0dea..aa886e7fd 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,25 +43,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json index cb6ecb002..0f6ee8e3f 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": 
"F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -47,25 +47,6 @@ "to": 0 }, "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json index 603ee2aa6..d8a8cbb3c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json @@ -34,7 +34,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -43,25 +43,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json index 61290ac69..a059858b8 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Log Analytics", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -70,7 +70,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -83,25 +83,6 @@ "to": 0 }, "model": { - "conditions": 
[ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "B", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json index ebdc0ffd0..8e0db517e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, @@ -42,25 +42,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json index beea05ea9..cba370182 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 } }, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json index 6204af829..cc70acc06 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json @@ -33,7 +33,7 @@ ] }, "relativeTimeRange": { - "from": 86400, + 
"from": 600, "to": 0 } }, diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json index 205291f2b..e56a3241c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json @@ -8,7 +8,7 @@ "queryType": "Azure Monitor", "datasourceUid": "F2XodEi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -64,25 +64,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json index 8a952b6ae..a4d269995 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json @@ -8,7 +8,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,7 +43,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -78,25 +78,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], 
"datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json index f958904f2..aa8895e42 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json @@ -8,7 +8,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -43,7 +43,7 @@ "queryType": "", "datasourceUid": "OlcfOPi7z", "relativeTimeRange": { - "from": 86400, + "from": 300, "to": 0 }, "model": { @@ -78,25 +78,6 @@ "queryType": "", "datasourceUid": "-100", "model": { - "conditions": [ - { - "evaluator": { - "params": [], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "type": "query" - } - ], "datasource": { "type": "__expr__", "uid": "-100" diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json index a0c0055f3..ab2671427 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -48,7 +48,7 @@ }, "id": 6, "options": { - "folderId": 46, + "folderId": "[parameter(arcade-services-folderid)]", "maxItems": 10, "query": "", "showHeadings": false, @@ -69,7 +69,7 @@ }, "id": 7, "options": { - "folderId": 92, + "folderId": "[parameter(helix-service-folderid)]", "maxItems": 10, "query": "", "showHeadings": false, diff --git a/src/Monitoring/Monitoring.ArcadeServices/parameters.json b/src/Monitoring/Monitoring.ArcadeServices/parameters.json index 
78079fcc0..7e8541b4b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/parameters.json +++ b/src/Monitoring/Monitoring.ArcadeServices/parameters.json @@ -161,7 +161,7 @@ } }, { - "Name" : "product-construction-service-workspace-resourcename", + "Name": "product-construction-service-workspace-resourcename", "Values": { "Staging": "product-construction-service-workspace-int", "Production": "product-construction-service-workspace-prod" @@ -180,5 +180,19 @@ "Staging": "home", "Production": "home" } + }, + { + "Name": "arcade-services-folderid", + "Values": { + "Staging": "46", + "Production": "37" + } + }, + { + "Name": "helix-service-folderid", + "Values": { + "Staging": "92", + "Production": "41" + } } ] \ No newline at end of file From fd13b749a663092819d1b803187dd0cc55f46de8 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 10:26:25 -0800 Subject: [PATCH 115/133] fix alerting rules folder directory --- .../Production/cores-consumption.alert.json | 2 +- .../arcadeAvailability.dashboard.json | 10 ++++++---- src/Monitoring/Sdk/DeployPublisher.cs | 17 ++++++++++++++++- src/Monitoring/Sdk/GrafanaSerialization.cs | 12 +++++++++++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json index 57705e756..d548c5370 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -109,7 +109,7 @@ "folderUID": "arcade-services", "ruleGroup": "Azure Quota Alerts", "intervalMs": 900000, - "isPaused": false, + "isPaused": true, "notification_settings": { "receiver": ".NET Status Alert", "group_wait": "5m", diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json 
b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json index 728548ed2..c6503e74e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json @@ -3335,10 +3335,12 @@ }, "query": { "params": [ - "A", - "1h", - "now-10m" - ] + "A" + ], + "relativeTimeRange": { + "from": 3600, + "to": 600 + } }, "reducer": { "params": [], diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index 891419108..e2d8b31b2 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -52,7 +52,22 @@ public DeployPublisher( private string EnvironmentDatasourceDirectory => Path.Combine(DatasourceDirectory, _environment); private string EnvironmentNotificationDirectory => Path.Combine(NotificationDirectory, _environment); - private string AlertRuleDirectory => Path.Combine(Path.GetDirectoryName(NotificationDirectory), "alertrules", _environment); + private string AlertRuleDirectory + { + get + { + string baseDir = Path.Combine(Path.GetDirectoryName(NotificationDirectory), "alertrules"); + string environmentSpecificDir = Path.Combine(baseDir, _environment); + + // If environment-specific folder exists, use it; otherwise fall back to base directory + if (Directory.Exists(environmentSpecificDir)) + { + return environmentSpecificDir; + } + + return baseDir; + } + } public void Dispose() { diff --git a/src/Monitoring/Sdk/GrafanaSerialization.cs b/src/Monitoring/Sdk/GrafanaSerialization.cs index d74c992e6..1f095c61d 100644 --- a/src/Monitoring/Sdk/GrafanaSerialization.cs +++ b/src/Monitoring/Sdk/GrafanaSerialization.cs @@ -211,7 +211,17 @@ public static JObject DeparameterizeDashboard(JObject dashboard, IEnumerable Date: Tue, 6 Jan 2026 10:58:39 -0800 Subject: [PATCH 116/133] refactor grafana publishing to use only one 
stage --- eng/deploy-managed-grafana.yml | 90 ++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 14 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 9d5099a0f..ec0afa726 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -13,11 +13,8 @@ parameters: type: string stages: -- stage: ProvisionGrafana - displayName: 'Provision Grafana Infrastructure' - # dependsOn: - # - predeploy - # - approval +- stage: DeployGrafana + displayName: 'Deploy Grafana Infrastructure and Dashboards' jobs: - template: /eng/provision-grafana.yaml@self parameters: @@ -28,13 +25,76 @@ stages: GrafanaLocation: 'westus2' GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} -- stage: PublishDashboards - displayName: 'Publish Grafana Dashboards' - dependsOn: ProvisionGrafana - variables: - GrafanaEndpoint: $[ stageDependencies.ProvisionGrafana.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] - jobs: - job: SetupToken + dependsOn: ProvisionGrafana + displayName: 'Setup Grafana API Token' + variables: + GrafanaEndpoint: $[ dependencies.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + steps: + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Grafana Admin Role' + inputs: + azureSubscription: ${{ parameters.ServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Grafana Admin role..." 
+ + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "monitoring-managed" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Grafana workspace resource ID + $grafanaId = az grafana show --name $workspaceName --resource-group $rgName --query "id" --output tsv + Write-Host "Grafana Workspace: $workspaceName" + Write-Host "Grafana ID: $grafanaId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $grafanaId ` + --role "Grafana Admin" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "āœ“ Pipeline service principal already has Grafana Admin role" + } else { + Write-Host "Granting Grafana Admin role..." + az role assignment create ` + --role "Grafana Admin" ` + --assignee $spObjectId ` + --scope $grafanaId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "āœ“ Pipeline service principal granted Grafana Admin role" + Write-Host "ā± Waiting 15 seconds for role assignment to propagate..." 
+ Start-Sleep -Seconds 15 + } else { + Write-Error "Failed to grant Grafana Admin role" + exit 1 + } + } + + - task: AzureCLI@2 + displayName: 'Create or Validate Grafana API Token' + inputs: + azureSubscription: ${{ parameters.ServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'scriptPath' + scriptPath: 'eng/setup-grafana-api-token.ps1' + arguments: >- + -Environment "${{ parameters.DeploymentEnvironment }}" + -KeyVaultName "${{ parameters.GrafanaKeyVault }}" + + - job: PublishDashboards displayName: 'Setup Grafana API Token' pool: name: NetCore1ESPool-Internal @@ -102,13 +162,15 @@ stages: - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' - dependsOn: SetupToken + dependsOn: + - ProvisionGrafana + - SetupToken pool: name: NetCore1ESPool-Internal demands: ImageOverride -equals 1es-windows-2022 variables: - - name: System.AccessToken - value: $(System.AccessToken) + GrafanaEndpoint: $[ dependencies.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] + System.AccessToken: $(System.AccessToken) steps: - task: UseDotNet@2 displayName: 'Install Correct .NET Version' From 3e027929d6dccde9386c231c048158c080094f88 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 11:17:05 -0800 Subject: [PATCH 117/133] refactor grafana publishing to use only one stage --- eng/deploy-managed-grafana.yml | 66 ---------------------------------- 1 file changed, 66 deletions(-) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index ec0afa726..97379318e 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -94,72 +94,6 @@ stages: -Environment "${{ parameters.DeploymentEnvironment }}" -KeyVaultName "${{ parameters.GrafanaKeyVault }}" - - job: PublishDashboards - displayName: 'Setup Grafana API Token' - pool: - name: NetCore1ESPool-Internal - demands: ImageOverride -equals 1es-windows-2022 - steps: - - task: AzureCLI@2 - displayName: 'Grant Pipeline Service 
Principal Grafana Admin Role' - inputs: - azureSubscription: ${{ parameters.ServiceConnectionName }} - scriptType: 'pscore' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Granting pipeline service principal Grafana Admin role..." - - $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" - $rgName = "monitoring-managed" - - # Get the current service principal object ID - $spObjectId = az account show --query "user.name" --output tsv - Write-Host "Service Principal Object ID: $spObjectId" - - # Get the Grafana workspace resource ID - $grafanaId = az grafana show --name $workspaceName --resource-group $rgName --query "id" --output tsv - Write-Host "Grafana Workspace: $workspaceName" - Write-Host "Grafana ID: $grafanaId" - - # Check if role assignment already exists - $existingAssignment = az role assignment list ` - --assignee $spObjectId ` - --scope $grafanaId ` - --role "Grafana Admin" ` - --query "[0].id" ` - --output tsv - - if ($existingAssignment) { - Write-Host "āœ“ Pipeline service principal already has Grafana Admin role" - } else { - Write-Host "Granting Grafana Admin role..." - az role assignment create ` - --role "Grafana Admin" ` - --assignee $spObjectId ` - --scope $grafanaId ` - --output none - - if ($LASTEXITCODE -eq 0) { - Write-Host "āœ“ Pipeline service principal granted Grafana Admin role" - Write-Host "ā± Waiting 15 seconds for role assignment to propagate..." 
- Start-Sleep -Seconds 15 - } else { - Write-Error "Failed to grant Grafana Admin role" - exit 1 - } - } - - - task: AzureCLI@2 - displayName: 'Create or Validate Grafana API Token' - inputs: - azureSubscription: ${{ parameters.ServiceConnectionName }} - scriptType: 'pscore' - scriptLocation: 'scriptPath' - scriptPath: 'eng/setup-grafana-api-token.ps1' - arguments: >- - -Environment "${{ parameters.DeploymentEnvironment }}" - -KeyVaultName "${{ parameters.GrafanaKeyVault }}" - - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' dependsOn: From 1b3005efd45e8d06fabff50ba1ab91ad0357e3d5 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 16:22:16 -0800 Subject: [PATCH 118/133] add managed grafana to the dotnet-dnceng-ci pipeline --- azure-pipelines.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 21f328f83..cbd977387 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -272,13 +272,17 @@ extends: ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 + ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - template: /eng/deploy-managed-grafana.yml@self parameters: ServiceConnectionName: 'Dotnet Engineering services' ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: DeploymentEnvironment: Staging GrafanaWorkspaceName: dnceng-grafana-staging + GrafanaKeyVault: dnceng-amg-int-kv ${{ else }}: DeploymentEnvironment: Production - GrafanaWorkspaceName: dnceng-grafana \ No newline at end of file + GrafanaWorkspaceName: dnceng-grafana + GrafanaKeyVault: dnceng-amg-prod-kv \ No newline at end of file From 44bc88a38a221e5fd6aea34ed6bf7b5dc3a50e9b Mon Sep 17 00:00:00 2001 From: Haruna 
Ogweda Date: Tue, 6 Jan 2026 16:31:03 -0800 Subject: [PATCH 119/133] add managed grafana to the dotnet-dnceng-ci pipeline --- azure-pipelines.yml | 2 +- eng/deploy-managed-grafana.yml | 1 - eng/deploy.yaml | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbd977387..97302c9e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -244,7 +244,7 @@ extends: -TsaCodebaseName "Dnceng" -TsaPublish $True -PoliCheckAdditionalRunConfigParams @("UserExclusionPath < $(Build.SourcesDirectory)/eng/PoliCheckExclusions.xml")' - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/main', 'refs/heads/production')}}: + - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/main', 'refs/heads/production')}}: - template: /eng/deploy.yaml@self parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 97379318e..0618a5f5e 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -97,7 +97,6 @@ stages: - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' dependsOn: - - ProvisionGrafana - SetupToken pool: name: NetCore1ESPool-Internal diff --git a/eng/deploy.yaml b/eng/deploy.yaml index 3e18cd66a..9beaa1470 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -156,7 +156,7 @@ stages: demands: ImageOverride -equals 1es-windows-2019 dependsOn: - deploy - - ProvisionGrafana + - DeployGrafana variables: - group: ${{ parameters.StatusVariableGroup }} - group: ${{ parameters.GrafanaVariableGroup }} @@ -202,7 +202,7 @@ stages: demands: ImageOverride -equals 1es-windows-2019 dependsOn: - deploy - - ProvisionGrafana + - DeployGrafana jobs: - job: scenario displayName: Scenario tests From 98aa83b266367d18214493b4c26d9e707fdfd3de Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 17:48:19 -0800 
Subject: [PATCH 120/133] remove self hosted grafana dashboard publishing --- azure-pipelines.yml | 6 ------ eng/deploy.yaml | 28 ---------------------------- 2 files changed, 34 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 97302c9e4..474912fb2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -254,9 +254,6 @@ extends: PublishProfile: Int ServiceConnectionName: NetHelixStaging StatusVariableGroup: DotNetStatus Staging - GrafanaHost: https://dotnet-eng-grafana-staging.westus2.cloudapp.azure.com - GrafanaKeyVault: dotnet-grafana-staging - GrafanaVariableGroup: Dotnet-Grafana-Staging ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0 ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 ${{ else }}: @@ -266,9 +263,6 @@ extends: PublishProfile: Prod ServiceConnectionName: NetHelix StatusVariableGroup: DotNetStatus Production - GrafanaHost: https://dotnet-eng-grafana.westus2.cloudapp.azure.com - GrafanaKeyVault: dotnet-grafana - GrafanaVariableGroup: Dotnet-Grafana-Production ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 diff --git a/eng/deploy.yaml b/eng/deploy.yaml index 9beaa1470..679838ca0 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -15,12 +15,6 @@ parameters: type: string - name: StatusVariableGroup type: string -- name: GrafanaHost - type: string -- name: GrafanaKeyVault - type: string -- name: GrafanaVariableGroup - type: string # --- Secret Variable group requirements --- # dotnet-build-bot-dotnet-eng-status-token @@ -159,17 +153,6 @@ stages: - DeployGrafana variables: - group: ${{ parameters.StatusVariableGroup }} - - group: ${{ parameters.GrafanaVariableGroup }} - - name: DeploymentEnvironment - value: ${{ parameters.DeploymentEnvironment }} - - name: GrafanaHost - value: ${{ parameters.GrafanaHost }} - - name: GrafanaKeyVault - value: ${{ parameters.GrafanaKeyVault }} - - name: GrafanaClientId - value: ${{ 
parameters.ServiceConnectionClientId }} - - name: GrafanaServiceConnectionId - value: ${{ parameters.ServiceConnectionId }} jobs: - job: notifyEndDeployment displayName: Notify deployment end @@ -183,17 +166,6 @@ stages: serviceConnection: ${{ parameters.DotNetStatusEndpoint }} method: POST urlSuffix: /dnceng/$(Build.BuildNumber)/end - - job: updateMetrics - displayName: Update Grafana Metrics - steps: - - task: UseDotNet@2 - displayName: Install Correct .NET Version - inputs: - useGlobalJson: true - - script: dotnet publish --configuration Release $(Build.SourcesDirectory)\src\Monitoring\Sdk\Microsoft.DotNet.Monitoring.Sdk.csproj -f net8.0 - displayName: Build Monitoring SDK - - script: dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj --configuration Release -t:PublishGrafana -p:GrafanaAccessToken=$(grafana-admin-api-key) -p:GrafanaHost=$(GrafanaHost) -p:GrafanaKeyVaultName=$(GrafanaKeyVault) -p:ClientId=$(GrafanaClientId) -p:ServiceConnectionId=$(GrafanaServiceConnectionId) -p:SystemAccessToken=$(System.AccessToken) -p:GrafanaEnvironment=$(DeploymentEnvironment) -p:ParametersFile=parameters.json -v:normal - displayName: Publish Grafana Dashboards - stage: validateDeployment displayName: Validate deployment From c087a9323e0f60feacce8854695fc7ab120d2655 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 17:50:53 -0800 Subject: [PATCH 121/133] fix grafana dashboard publishing error --- eng/deploy-managed-grafana.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 0618a5f5e..97379318e 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -97,6 +97,7 @@ stages: - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' dependsOn: + - ProvisionGrafana - SetupToken pool: name: NetCore1ESPool-Internal From dd901b26703a847f2dbc187799b0ecbad7ceaa25 Mon Sep 17 00:00:00 
2001 From: Haruna Ogweda Date: Tue, 6 Jan 2026 23:37:06 -0800 Subject: [PATCH 122/133] fix error when adding ANG to dotnet-dnceng-ci pipeline --- azure-pipelines-managed-grafana.yml | 18 +++++------ azure-pipelines.yml | 27 ++++++++--------- eng/deploy-managed-grafana.yml | 47 ++++++++++++++--------------- eng/deploy.yaml | 21 +++++++++++++ 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 6b93400cb..53649c219 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -29,14 +29,14 @@ extends: - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - template: /eng/deploy-managed-grafana.yml@self parameters: - ServiceConnectionName: 'Dotnet Engineering services' - ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 - ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba + AMGServiceConnectionName: 'Dotnet Engineering services' + AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: - DeploymentEnvironment: Staging - GrafanaWorkspaceName: dnceng-grafana-staging - GrafanaKeyVault: dnceng-amg-int-kv + AMGDeploymentEnvironment: Staging + AMGGrafanaWorkspaceName: dnceng-grafana-staging + AMGGrafanaKeyVault: dnceng-amg-int-kv ${{ else }}: - DeploymentEnvironment: Production - GrafanaWorkspaceName: dnceng-grafana - GrafanaKeyVault: dnceng-amg-prod-kv + AMGDeploymentEnvironment: Production + AMGGrafanaWorkspaceName: dnceng-grafana + AMGGrafanaKeyVault: dnceng-amg-prod-kv \ No newline at end of file diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 474912fb2..a7015ae9d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -256,6 +256,12 @@ extends: StatusVariableGroup: DotNetStatus Staging 
ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0 ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 + AMGServiceConnectionName: 'Dotnet Engineering services' + AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba + AMGDeploymentEnvironment: Staging + AMGGrafanaWorkspaceName: dnceng-grafana-staging + AMGGrafanaKeyVault: dnceng-amg-int-kv ${{ else }}: DeploymentEnvironment: Production DotNetStatusAppName: dotneteng-status @@ -265,18 +271,9 @@ extends: StatusVariableGroup: DotNetStatus Production ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - - template: /eng/deploy-managed-grafana.yml@self - parameters: - ServiceConnectionName: 'Dotnet Engineering services' - ServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 - ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba - ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: - DeploymentEnvironment: Staging - GrafanaWorkspaceName: dnceng-grafana-staging - GrafanaKeyVault: dnceng-amg-int-kv - ${{ else }}: - DeploymentEnvironment: Production - GrafanaWorkspaceName: dnceng-grafana - GrafanaKeyVault: dnceng-amg-prod-kv \ No newline at end of file + AMGServiceConnectionName: 'Dotnet Engineering services' + AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba + AMGDeploymentEnvironment: Production + AMGGrafanaWorkspaceName: dnceng-grafana + AMGGrafanaKeyVault: dnceng-amg-prod-kv diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 97379318e..5615c9b61 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -1,15 +1,15 @@ parameters: -- name: ServiceConnectionName +- name: 
AMGServiceConnectionName type: string -- name: ServiceConnectionClientId +- name: AMGServiceConnectionClientId type: string -- name: ServiceConnectionId +- name: AMGServiceConnectionId type: string -- name: DeploymentEnvironment +- name: AMGDeploymentEnvironment type: string -- name: GrafanaWorkspaceName +- name: AMGGrafanaWorkspaceName type: string -- name: GrafanaKeyVault +- name: AMGGrafanaKeyVault type: string stages: @@ -18,13 +18,12 @@ stages: jobs: - template: /eng/provision-grafana.yaml@self parameters: - DeploymentEnvironment: ${{ parameters.DeploymentEnvironment }} - ServiceConnectionName: ${{ parameters.ServiceConnectionName }} + DeploymentEnvironment: ${{ parameters.AMGDeploymentEnvironment }} + ServiceConnectionName: ${{ parameters.AMGServiceConnectionName }} GrafanaResourceGroup: 'monitoring-managed' - GrafanaWorkspaceName: ${{ parameters.GrafanaWorkspaceName }} + GrafanaWorkspaceName: ${{ parameters.AMGGrafanaWorkspaceName }} GrafanaLocation: 'westus2' - GrafanaKeyVault: ${{ parameters.GrafanaKeyVault }} - + GrafanaKeyVault: ${{ parameters.AMGGrafanaKeyVault }} - job: SetupToken dependsOn: ProvisionGrafana displayName: 'Setup Grafana API Token' @@ -37,13 +36,13 @@ stages: - task: AzureCLI@2 displayName: 'Grant Pipeline Service Principal Grafana Admin Role' inputs: - azureSubscription: ${{ parameters.ServiceConnectionName }} + azureSubscription: ${{ parameters.AMGServiceConnectionName }} scriptType: 'pscore' scriptLocation: 'inlineScript' inlineScript: | Write-Host "Granting pipeline service principal Grafana Admin role..." 
- $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $workspaceName = "${{ parameters.AMGGrafanaWorkspaceName }}" $rgName = "monitoring-managed" # Get the current service principal object ID @@ -86,13 +85,13 @@ stages: - task: AzureCLI@2 displayName: 'Create or Validate Grafana API Token' inputs: - azureSubscription: ${{ parameters.ServiceConnectionName }} + azureSubscription: ${{ parameters.AMGServiceConnectionName }} scriptType: 'pscore' scriptLocation: 'scriptPath' scriptPath: 'eng/setup-grafana-api-token.ps1' arguments: >- - -Environment "${{ parameters.DeploymentEnvironment }}" - -KeyVaultName "${{ parameters.GrafanaKeyVault }}" + -Environment "${{ parameters.AMGDeploymentEnvironment }}" + -KeyVaultName "${{ parameters.AMGGrafanaKeyVault }}" - job: PublishDashboards displayName: 'Publish Dashboards to Azure Managed Grafana' @@ -117,7 +116,7 @@ stages: - task: AzureCLI@2 displayName: 'Publish Grafana Dashboards' inputs: - azureSubscription: ${{ parameters.ServiceConnectionName }} + azureSubscription: ${{ parameters.AMGServiceConnectionName }} scriptType: 'pscore' scriptLocation: 'inlineScript' addSpnToEnvironment: true @@ -126,7 +125,7 @@ stages: Write-Host "Publishing Dashboards to Azure Managed Grafana" Write-Host "==========================================" Write-Host "Grafana Endpoint: $(GrafanaEndpoint)" - Write-Host "Environment: ${{ parameters.DeploymentEnvironment }}" + Write-Host "Environment: ${{ parameters.AMGDeploymentEnvironment }}" Write-Host "" # Get the API token from Key Vault with retry logic for RBAC propagation @@ -140,7 +139,7 @@ stages: while (-not $apiToken -and $retryCount -lt $maxRetries) { try { - $apiToken = az keyvault secret show --vault-name "${{ parameters.GrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv 2>&1 + $apiToken = az keyvault secret show --vault-name "${{ parameters.AMGGrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv 2>&1 if ($LASTEXITCODE -eq 0 -and $apiToken -and 
$apiToken.Trim()) { Write-Host "āœ“ API token retrieved successfully from Key Vault" @@ -157,7 +156,7 @@ stages: } else { Write-Error "Unable to retrieve API token after $maxRetries attempts ($($maxRetries * $waitSeconds) seconds total)" Write-Error "Secret name: $tokenSecretName" - Write-Error "Key Vault: ${{ parameters.GrafanaKeyVault }}" + Write-Error "Key Vault: ${{ parameters.AMGGrafanaKeyVault }}" Write-Error "" Write-Error "Possible causes:" Write-Error "1. RBAC permissions haven't propagated yet (can take 5-10 minutes)" @@ -179,11 +178,11 @@ stages: -t:PublishGrafana ` -p:GrafanaAccessToken=$apiToken ` -p:GrafanaHost="$(GrafanaEndpoint)" ` - -p:GrafanaKeyVaultName="${{ parameters.GrafanaKeyVault }}" ` - -p:GrafanaEnvironment="${{ parameters.DeploymentEnvironment }}" ` + -p:GrafanaKeyVaultName="${{ parameters.AMGGrafanaKeyVault }}" ` + -p:GrafanaEnvironment="${{ parameters.AMGDeploymentEnvironment }}" ` -p:ParametersFile=parameters.json ` - -p:ClientId="${{ parameters.ServiceConnectionClientId }}" ` - -p:ServiceConnectionId="${{ parameters.ServiceConnectionId }}" ` + -p:ClientId="${{ parameters.AMGServiceConnectionClientId }}" ` + -p:ServiceConnectionId="${{ parameters.AMGServiceConnectionId }}" ` -p:SystemAccessToken="$(System.AccessToken)" ` -v:normal diff --git a/eng/deploy.yaml b/eng/deploy.yaml index 679838ca0..57a8b5ded 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -15,6 +15,18 @@ parameters: type: string - name: StatusVariableGroup type: string +- name: AMGServiceConnectionName + type: string +- name: AMGServiceConnectionClientId + type: string +- name: AMGServiceConnectionId + type: string +- name: AMGDeploymentEnvironment + type: string +- name: AMGGrafanaWorkspaceName + type: string +- name: AMGGrafanaKeyVault + type: string # --- Secret Variable group requirements --- # dotnet-build-bot-dotnet-eng-status-token @@ -143,6 +155,15 @@ stages: DeploymentType: zipDeploy RemoveAdditionalFilesFlag: true +- template: 
/deploy-managed-grafana.yml@self + parameters: + AMGServiceConnectionName: ${{ parameters.AMGServiceConnectionName }} + AMGServiceConnectionId: ${{ parameters.AMGServiceConnectionId }} + AMGServiceConnectionClientId: ${{ parameters.AMGServiceConnectionClientId }} + AMGDeploymentEnvironment: ${{ parameters.AMGDeploymentEnvironment }} + AMGGrafanaWorkspaceName: ${{ parameters.AMGGrafanaWorkspaceName }} + AMGGrafanaKeyVault: ${{ parameters.AMGGrafanaKeyVault }} + - stage: postdeploy displayName: Post-Deployment pool: From 6eaa43083f4db99539b10ef52c4a5c77d2c29f68 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 7 Jan 2026 00:27:33 -0800 Subject: [PATCH 123/133] remove test pipeline --- azure-pipelines-managed-grafana.yml | 2 +- eng/deploy-managed-grafana.yml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml index 53649c219..69ea60d07 100644 --- a/azure-pipelines-managed-grafana.yml +++ b/azure-pipelines-managed-grafana.yml @@ -2,7 +2,7 @@ trigger: batch: true branches: include: - - haruna/managed-grafana-new + # - haruna/managed-grafana-new - production pr: none diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml index 5615c9b61..de9e2368d 100644 --- a/eng/deploy-managed-grafana.yml +++ b/eng/deploy-managed-grafana.yml @@ -15,6 +15,12 @@ parameters: stages: - stage: DeployGrafana displayName: 'Deploy Grafana Infrastructure and Dashboards' + pool: + name: NetCore1ESPool-Internal-NoMSI + demands: ImageOverride -equals 1es-windows-2019 + dependsOn: + - predeploy + - approval jobs: - template: /eng/provision-grafana.yaml@self parameters: From 89c3cf816ab33fe0e4d52ce45e0b0e89b0d7f340 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 7 Jan 2026 00:33:33 -0800 Subject: [PATCH 124/133] fix deploy-managed-grafana.ml filepath --- eng/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/deploy.yaml b/eng/deploy.yaml 
index 57a8b5ded..15e326c91 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -155,7 +155,7 @@ stages: DeploymentType: zipDeploy RemoveAdditionalFilesFlag: true -- template: /deploy-managed-grafana.yml@self +- template: /eng/deploy-managed-grafana.yml@self parameters: AMGServiceConnectionName: ${{ parameters.AMGServiceConnectionName }} AMGServiceConnectionId: ${{ parameters.AMGServiceConnectionId }} From 382350c87e7957a612a621396e2630a3e8d4c5e0 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Wed, 7 Jan 2026 00:41:51 -0800 Subject: [PATCH 125/133] include serviceConnectionName variable --- azure-pipelines-pr.yml | 5 +---- azure-pipelines.yml | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/azure-pipelines-pr.yml b/azure-pipelines-pr.yml index 29e8692a9..53cb10166 100644 --- a/azure-pipelines-pr.yml +++ b/azure-pipelines-pr.yml @@ -7,10 +7,7 @@ pr: - production variables: - ${{ if eq(variables['System.PullRequest.TargetBranch'], 'refs/heads/production') }}: - ServiceConnectionName: 'dnceng-managed-grafana' - ${{ else }}: - ServiceConnectionName: 'dnceng-managed-grafana-staging' + ServiceConnectionName: 'Dotnet Engineering services' stages: - stage: build diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a7015ae9d..c053a952a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,6 +7,8 @@ variables: - name: _DotNetArtifactsCategory value: .NETCore - group: SDL_Settings + - name: ServiceConnectionName + value: 'Dotnet Engineering services' trigger: batch: true From 5e64cdb721308fee8ee84fd81a9db1c6de3bc64e Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Fri, 9 Jan 2026 11:17:46 -0800 Subject: [PATCH 126/133] Allow anonymous access to alert webhook endpoint for Grafana --- .../DotNet.Status.Web/Controllers/AlertHookController.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs 
b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs index f9a9f1dec..5620acc7a 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs @@ -9,6 +9,7 @@ using System.Threading.Tasks; using DotNet.Status.Web.Models; using DotNet.Status.Web.Options; +using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; using Microsoft.DotNet.GitHub.Authentication; using Microsoft.Extensions.Logging; @@ -19,6 +20,7 @@ namespace DotNet.Status.Web.Controllers; [ApiController] [Route("api/alert")] +[AllowAnonymous] public class AlertHookController : ControllerBase { public const string NotificationIdLabel = "Grafana Alert"; From 840b1e30050e2f38cd016ca136966be0ae7a4b95 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 13 Jan 2026 10:36:14 -0800 Subject: [PATCH 127/133] remove AllowAnonymous from alertHookController --- .../DotNet.Status.Web/Controllers/AlertHookController.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs index 5620acc7a..2db2c1129 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs @@ -20,7 +20,6 @@ namespace DotNet.Status.Web.Controllers; [ApiController] [Route("api/alert")] -[AllowAnonymous] public class AlertHookController : ControllerBase { public const string NotificationIdLabel = "Grafana Alert"; From 75f20912849ffcb810a2a9114d42d7dd3aa34d6c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 13 Jan 2026 11:15:21 -0800 Subject: [PATCH 128/133] delete azure-pipelines-managed-grafana pipeline --- azure-pipelines-managed-grafana.yml | 42 ----------------------------- azure-pipelines.yml | 2 +- 2 files changed, 1 insertion(+), 43 
deletions(-) delete mode 100644 azure-pipelines-managed-grafana.yml diff --git a/azure-pipelines-managed-grafana.yml b/azure-pipelines-managed-grafana.yml deleted file mode 100644 index 69ea60d07..000000000 --- a/azure-pipelines-managed-grafana.yml +++ /dev/null @@ -1,42 +0,0 @@ -trigger: - batch: true - branches: - include: - # - haruna/managed-grafana-new - - production -pr: none - -resources: - repositories: - - repository: 1ESPipelineTemplates - type: git - name: 1ESPipelineTemplates/1ESPipelineTemplates - ref: refs/tags/release -extends: - template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates - parameters: - pool: - name: NetCore1ESPool-Internal - image: 1es-windows-2022 - os: windows - sdl: - policheck: - enabled: true - tsa: - enabled: true - - stages: - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/production')}}: - - template: /eng/deploy-managed-grafana.yml@self - parameters: - AMGServiceConnectionName: 'Dotnet Engineering services' - AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 - AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba - ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: - AMGDeploymentEnvironment: Staging - AMGGrafanaWorkspaceName: dnceng-grafana-staging - AMGGrafanaKeyVault: dnceng-amg-int-kv - ${{ else }}: - AMGDeploymentEnvironment: Production - AMGGrafanaWorkspaceName: dnceng-grafana - AMGGrafanaKeyVault: dnceng-amg-prod-kv \ No newline at end of file diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dfd97225e..f4aa6187f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -235,7 +235,7 @@ extends: -TsaCodebaseName "Dnceng" -TsaPublish $True -PoliCheckAdditionalRunConfigParams @("UserExclusionPath < $(Build.SourcesDirectory)/eng/PoliCheckExclusions.xml")' - - ${{ if in(variables['Build.SourceBranch'], 'refs/heads/haruna/managed-grafana-new', 'refs/heads/main', 'refs/heads/production')}}: + - ${{ if 
in(variables['Build.SourceBranch'], 'refs/heads/main', 'refs/heads/production')}}: - template: /eng/deploy.yaml@self parameters: ${{ if ne(variables['Build.SourceBranch'], 'refs/heads/production') }}: From f4067660ba2247c006708c85a2770fac5e05446c Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 13 Jan 2026 15:31:16 -0800 Subject: [PATCH 129/133] remove unnecessary comment --- .vault-config/dnceng-amg-int-kv.yaml | 4 +--- .vault-config/dnceng-amg-prod-kv.yaml | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml index c0ebb0381..8648a73b8 100644 --- a/.vault-config/dnceng-amg-int-kv.yaml +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -4,9 +4,7 @@ storageLocation: subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 name: dnceng-amg-int-kv -secrets: - # Copy only the secrets needed for Azure Managed Grafana datasources and notifications - +secrets: # API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text diff --git a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml index c6effbac6..c20915d7d 100644 --- a/.vault-config/dnceng-amg-prod-kv.yaml +++ b/.vault-config/dnceng-amg-prod-kv.yaml @@ -5,8 +5,6 @@ storageLocation: name: dnceng-amg-prod-kv secrets: - # Copy only the secrets needed for Azure Managed Grafana datasources and notifications - # API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text From 93896b0c41ec2774e270db616db4a4f4c964788f Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 13 Jan 2026 16:09:03 -0800 Subject: [PATCH 130/133] fix service connection naming --- azure-pipelines-pr.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/azure-pipelines-pr.yml b/azure-pipelines-pr.yml index 7f30811bd..ead7feaf9 100644 --- a/azure-pipelines-pr.yml +++ b/azure-pipelines-pr.yml @@ -6,9 +6,6 @@ pr: - main - production -variables: - 
ServiceConnectionName: 'Dotnet Engineering services' - stages: - stage: build dependsOn: [] @@ -113,7 +110,7 @@ stages: - task: AzureCLI@2 displayName: 'Validate Grafana Bicep Template' inputs: - azureSubscription: '$(ServiceConnectionName)' + azureSubscription: Dotnet Engineering services scriptType: 'ps' scriptLocation: 'inlineScript' inlineScript: | From 554a3e20fe62f7616ee53d9ae49b331c5bdcca2a Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Tue, 13 Jan 2026 20:37:40 -0800 Subject: [PATCH 131/133] remove Validation of the Grafana Bicep Template from the PR stage --- azure-pipelines-pr.yml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/azure-pipelines-pr.yml b/azure-pipelines-pr.yml index ead7feaf9..9f74245af 100644 --- a/azure-pipelines-pr.yml +++ b/azure-pipelines-pr.yml @@ -106,21 +106,3 @@ stages: displayName: Verify Secret Usages - template: /eng/test.yaml - - - task: AzureCLI@2 - displayName: 'Validate Grafana Bicep Template' - inputs: - azureSubscription: Dotnet Engineering services - scriptType: 'ps' - scriptLocation: 'inlineScript' - inlineScript: | - Write-Host "Validating Grafana Bicep template..." 
- if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { - throw "Bicep template not found: azure-managed-grafana.bicep" - } - - az bicep build --file eng/deployment/azure-managed-grafana.bicep - if ($LASTEXITCODE -ne 0) { - throw "Bicep template validation failed" - } - Write-Host "SUCCESS: Bicep template validation successful" \ No newline at end of file From b42bc65619c79c99012c3c7b8c95958094c5a9aa Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 15 Jan 2026 09:55:17 -0800 Subject: [PATCH 132/133] add azure managed grafana api key to secret manager --- .vault-config/dnceng-amg-int-kv.yaml | 7 +++ .vault-config/dnceng-amg-prod-kv.yaml | 7 +++ .../SecretTypes/AzureManagedGrafanaApiKey.cs | 51 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml index 8648a73b8..5f1215876 100644 --- a/.vault-config/dnceng-amg-int-kv.yaml +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -5,6 +5,13 @@ storageLocation: name: dnceng-amg-int-kv secrets: + # API token for Azure Managed Grafana (staging) + grafana-admin-api-key: + type: azure-managed-grafana-api-key + parameters: + description: Service account token for Azure Managed Grafana staging workspace + environment: staging + # API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text diff --git a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml index c20915d7d..0180abc55 100644 --- a/.vault-config/dnceng-amg-prod-kv.yaml +++ b/.vault-config/dnceng-amg-prod-kv.yaml @@ -5,6 +5,13 @@ storageLocation: name: dnceng-amg-prod-kv secrets: + # API token for Azure Managed Grafana (staging) + grafana-admin-api-key: + type: azure-managed-grafana-api-key + parameters: + description: Service account token for Azure Managed Grafana staging workspace + environment: production + 
# API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text diff --git a/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs b/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs new file mode 100644 index 000000000..04410f331 --- /dev/null +++ b/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs @@ -0,0 +1,51 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using Microsoft.DncEng.CommandLineLib; + +namespace Microsoft.DncEng.SecretManager.SecretTypes; + +[Name("azure-managed-grafana-api-key")] +public class AzureManagedGrafanaApiKey : GenericAccessToken +{ + private readonly string[] _expirationDateFormats = new[] { "yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss" }; + + protected override string HelpMessage => "Please login to https://{0} and navigate to Administration > Service accounts to create a new service account token."; + protected override string TokenName => "Azure Managed Grafana API key"; + protected override string TokenFormatDescription => "Service account token (starts with 'glsa_')"; + protected override string ExpirationFormatDescription => "format yyyy-MM-dd followed by optional time part hh:mm:ss or empty for no expiration"; + protected override bool HasExpiration => true; + + protected override IEnumerable> EnvironmentToHost => new[] + { + new KeyValuePair( "production", "https://dnceng-grafana-eraubnb4dkatgnfn.wus2.grafana.azure.com/" ), + new KeyValuePair( "staging", "https://dnceng-grafana-staging-faf3f3ebf0f8afbm.wus2.grafana.azure.com/" ) + }; + + public AzureManagedGrafanaApiKey(ISystemClock clock, IConsole console) : base(clock, console) + { + } + + protected override bool TryParseExpirationDate(string value, out DateTime parsedValue) + { + if (string.IsNullOrWhiteSpace(value)) + { + parsedValue = DateTime.MaxValue; + return true; + } + return DateTime.TryParseExact(value, 
_expirationDateFormats, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsedValue); + } + + protected override bool ValidateToken(string token) + { + // Azure Managed Grafana service account tokens start with "glsa_" + if (token.StartsWith("glsa_", StringComparison.Ordinal)) + { + Console.WriteLine("Azure Managed Grafana service account token validated successfully."); + return true; + } + + Console.WriteLine("Invalid token format. Azure Managed Grafana tokens must start with 'glsa_'."); + return false; + } +} From 708dd927e1c03e9bcdb40403ec37ea0852531f86 Mon Sep 17 00:00:00 2001 From: Haruna Ogweda Date: Thu, 22 Jan 2026 18:15:01 -0800 Subject: [PATCH 133/133] remove unused files --- .vault-config/dnceng-amg-int-kv.yaml | 7 - .vault-config/dnceng-amg-prod-kv.yaml | 7 - eng/generate-appgw-cert.ps1 | 169 --------------- eng/provision-grafana.yaml | 2 +- .../Controllers/AlertHookController.cs | 1 - .../Controllers/AnnotationsController.cs | 5 - .../alertrules/README.md | 199 ------------------ .../Deployment Annotations.datasource.json | 25 --- .../Deployment Annotations.datasource.json | 25 --- src/Monitoring/Sdk/GrafanaClient.cs | 1 - .../SecretTypes/AzureManagedGrafanaApiKey.cs | 51 ----- 11 files changed, 1 insertion(+), 491 deletions(-) delete mode 100644 eng/generate-appgw-cert.ps1 delete mode 100644 src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md delete mode 100644 src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json delete mode 100644 src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json delete mode 100644 src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml index 5f1215876..8648a73b8 100644 --- a/.vault-config/dnceng-amg-int-kv.yaml +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -5,13 +5,6 @@ storageLocation: name: 
dnceng-amg-int-kv secrets: - # API token for Azure Managed Grafana (staging) - grafana-admin-api-key: - type: azure-managed-grafana-api-key - parameters: - description: Service account token for Azure Managed Grafana staging workspace - environment: staging - # API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text diff --git a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml index 0180abc55..c20915d7d 100644 --- a/.vault-config/dnceng-amg-prod-kv.yaml +++ b/.vault-config/dnceng-amg-prod-kv.yaml @@ -5,13 +5,6 @@ storageLocation: name: dnceng-amg-prod-kv secrets: - # API token for Azure Managed Grafana (staging) - grafana-admin-api-key: - type: azure-managed-grafana-api-key - parameters: - description: Service account token for Azure Managed Grafana staging workspace - environment: production - # API token for DotNet Status website dotnet-build-bot-dotnet-eng-status-token: type: text diff --git a/eng/generate-appgw-cert.ps1 b/eng/generate-appgw-cert.ps1 deleted file mode 100644 index c0282c413..000000000 --- a/eng/generate-appgw-cert.ps1 +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env pwsh - -param( - [Parameter(Mandatory = $true)] - [string]$DnsName, - - [Parameter(Mandatory = $true)] - [string]$KeyVaultName, - - [Parameter(Mandatory = $false)] - [string]$CertificateName = "appgw-ssl-cert", - - [Parameter(Mandatory = $true)] - [string]$ResourceGroupName, - - [Parameter(Mandatory = $false)] - [string]$Location = "westus2" -) - -$ErrorActionPreference = "Stop" - -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "Azure Key Vault Certificate Setup" -ForegroundColor Cyan -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "" -Write-Host "DNS Name: $DnsName" -ForegroundColor White -Write-Host "Key Vault: $KeyVaultName" -ForegroundColor White -Write-Host "Certificate: $CertificateName" -ForegroundColor White -Write-Host 
"Resource Group: $ResourceGroupName" -ForegroundColor White -Write-Host "" - -# Check if Key Vault exists (should already exist from Grafana provisioning) -Write-Host "Verifying Key Vault exists..." -ForegroundColor Yellow -$kvExists = az keyvault show --name $KeyVaultName --resource-group $ResourceGroupName 2>$null - -if (!$kvExists) { - Write-Error "Key Vault '$KeyVaultName' not found. It should have been created during Grafana provisioning." - Write-Host "Expected Key Vault names:" -ForegroundColor Yellow - Write-Host " Production: dnceng-amg-prod-kv" -ForegroundColor White - Write-Host " Staging: dnceng-amg-int-kv" -ForegroundColor White - exit 1 -} - -Write-Host "āœ“ Key Vault exists (from Grafana provisioning)" -ForegroundColor Green - -# Check if certificate already exists -Write-Host "" -Write-Host "Checking if certificate exists in Key Vault..." -ForegroundColor Yellow -$certExists = az keyvault certificate show ` - --vault-name $KeyVaultName ` - --name $CertificateName ` - --query "id" ` - --output tsv 2>$null - -if ($certExists) { - Write-Host "āœ“ Certificate '$CertificateName' already exists" -ForegroundColor Green - Write-Host " Using existing certificate" -ForegroundColor White -} else { - Write-Host "Certificate not found. Creating self-signed certificate..." 
-ForegroundColor Yellow - - # Create certificate policy for self-signed cert - $policy = @" -{ - "issuerParameters": { - "name": "Self" - }, - "x509CertificateProperties": { - "subject": "CN=$DnsName", - "subjectAlternativeNames": { - "dnsNames": ["$DnsName"] - }, - "validityInMonths": 12, - "keyUsage": [ - "digitalSignature", - "keyEncipherment" - ], - "ekus": [ - "1.3.6.1.5.5.7.3.1" - ] - }, - "keyProperties": { - "exportable": true, - "keyType": "RSA", - "keySize": 2048, - "reuseKey": false - }, - "secretProperties": { - "contentType": "application/x-pkcs12" - } -} -"@ - - $policyFile = Join-Path $env:TEMP "cert-policy-$([Guid]::NewGuid()).json" - $policy | Out-File -FilePath $policyFile -Encoding UTF8 - - # Create certificate in Key Vault - Write-Host "Creating certificate in Key Vault (this may take 10-15 seconds)..." -ForegroundColor Yellow - - az keyvault certificate create ` - --vault-name $KeyVaultName ` - --name $CertificateName ` - --policy "@$policyFile" ` - --output none - - if ($LASTEXITCODE -ne 0) { - Write-Error "Failed to create certificate in Key Vault" - Remove-Item $policyFile -Force - exit 1 - } - - Remove-Item $policyFile -Force - - Write-Host "āœ“ Self-signed certificate created successfully" -ForegroundColor Green -} - -# Get certificate secret ID (for Application Gateway) -Write-Host "" -Write-Host "Retrieving certificate secret ID..." 
-ForegroundColor Yellow - -$secretId = az keyvault certificate show ` - --vault-name $KeyVaultName ` - --name $CertificateName ` - --query "sid" ` - --output tsv - -if ([string]::IsNullOrEmpty($secretId)) { - Write-Error "Failed to retrieve certificate secret ID" - exit 1 -} - -# Get unversioned secret ID (recommended for App Gateway) -$unversionedSecretId = $secretId -replace '/[^/]+$', '' - -Write-Host "āœ“ Certificate secret ID retrieved" -ForegroundColor Green -Write-Host "" -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "Certificate Details" -ForegroundColor Cyan -Write-Host "================================================" -ForegroundColor Cyan -Write-Host "" -Write-Host "Secret ID (versioned):" -ForegroundColor White -Write-Host " $secretId" -ForegroundColor Gray -Write-Host "" -Write-Host "Secret ID (unversioned - recommended):" -ForegroundColor White -Write-Host " $unversionedSecretId" -ForegroundColor Gray -Write-Host "" - -# Get certificate details -$certDetails = az keyvault certificate show ` - --vault-name $KeyVaultName ` - --name $CertificateName ` - --output json | ConvertFrom-Json - -$thumbprint = $certDetails.x509Thumbprint -$expiryDate = $certDetails.attributes.expires -$issuer = $certDetails.policy.issuerParameters.name - -Write-Host "Thumbprint: $thumbprint" -ForegroundColor White -Write-Host "Issuer: $issuer" -ForegroundColor White -Write-Host "Expires: $expiryDate" -ForegroundColor White -Write-Host "" - -# Output for pipeline use -Write-Host "Setting pipeline variables..." -ForegroundColor Yellow -Write-Host "##vso[task.setvariable variable=KeyVaultSecretId]$unversionedSecretId" -Write-Host "##vso[task.setvariable variable=CertificateThumbprint]$thumbprint" -Write-Host "##vso[task.setvariable variable=KeyVaultName]$KeyVaultName" - -Write-Host "" -Write-Host "āœ“ Certificate setup complete!" 
-ForegroundColor Green diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml index 2aa67a0d9..122422601 100644 --- a/eng/provision-grafana.yaml +++ b/eng/provision-grafana.yaml @@ -97,7 +97,7 @@ jobs: deploymentOutputs: 'grafanaOutputs' - task: PowerShell@2 - displayName: 'Export Grafana Identity for App Gateway' + displayName: 'Export Grafana Identity' name: ExportIdentity inputs: targetType: 'inline' diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs index 2db2c1129..f9a9f1dec 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AlertHookController.cs @@ -9,7 +9,6 @@ using System.Threading.Tasks; using DotNet.Status.Web.Models; using DotNet.Status.Web.Options; -using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; using Microsoft.DotNet.GitHub.Authentication; using Microsoft.Extensions.Logging; diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs index b82500651..329bfec26 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs @@ -143,11 +143,6 @@ public async Task>> Post(AnnotationQue return annotationEntries; } - /// - /// Native Grafana annotations endpoint. Returns annotations in the format expected by - /// Grafana's built-in annotation queries. - /// Supports both POST with body and GET with query parameters. 
- /// [HttpPost] [HttpGet] [Route("grafana")] diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md b/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md deleted file mode 100644 index 86cc3ffbb..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/alertrules/README.md +++ /dev/null @@ -1,199 +0,0 @@ -# Alert Migration Status - -## āœ… Completed - -### SDK Implementation -- āœ… Added `CreateAlertRuleAsync()` to GrafanaClient.cs -- āœ… Added `PostAlertRulesAsync()` to DeployPublisher.cs -- āœ… Integrated alert rule provisioning into PublishGrafana pipeline -- āœ… Created alertrules directory structure - -### Alert Rules Created -1. āœ… `pcs-work-item-success-rate.alert.json` - Monitors PCS work item success rate, alerts when < 74% -2. āœ… `pcs-exceptions-high.alert.json` - Monitors exception count, alerts when > 15 exceptions - -## šŸ“‹ Remaining Alerts to Convert - -### From arcadeAvailability.dashboard.json -3. ā³ PCS Background Worker Stopped - Alerts when work item processing stops (< 20 items) -4. ā³ PCS Disk Space Issues alert - Monitors disk space availability -5. ā³ Git Push success rate alert - Tracks git operation success -6. ā³ Container job execution failures alert - Azure DevOps pipeline failures -7. ā³ Helix API availability - API health check -8. ā³ Helix API Average Response Time - Performance monitoring -9. ā³ Helix AutoScaler Service Stopped Running - Service health -10. ā³ DotNetEng Status Failed Requests/Hour alert - HTTP error tracking -11. ā³ source.dot.net Availability - Website uptime - -### From quota.dashboard.json -12. ā³ Alert 1 (TBD - need to extract) -13. ā³ Alert 2 (TBD - need to extract) -14. ā³ Alert 3 (TBD - need to extract) -15. ā³ Alert 4 (TBD - need to extract) - -## šŸ”„ Alert Migration Process - -Each alert requires: - -1. 
**Extract from dashboard JSON** - - Find the panel with `"alert": {}` block - - Extract `alert.name`, `alert.message`, `alert.conditions`, `alert.notifications` - - Extract `targets` array (queries) - -2. **Convert to unified alerting format** - - Create new `.alert.json` file with kebab-case uid - - Convert queries to `data` array - - Add reduce expression (refId: B) - extracts last value from time series - - Add threshold expression (refId: C) - applies condition - - Map state: `keep_state` → `KeepLast`, `ok` → `OK`, `alerting` → `Alerting` - - Convert `for` duration (e.g., "5m") - - Convert `frequency` to `intervalSeconds` (e.g., "1m" → 60) - - Move `alertRuleTags` to `labels` - - Move `message` to `annotations.description` - - Reference `folderUID`: "arcade-services" - -3. **Handle notifications** - - Legacy: `"notifications": [{"uid": "statusHook"}]` - - Unified: Grafana automatically routes based on notification policy - - Contact points already created: "statusHook", "Teams Alert", etc. - -4. **Create for both environments** - - Copy to `alertrules/Staging/` - - Copy to `alertrules/Production/` - - Parameters auto-replaced during deployment - -5. 
**Remove from dashboard** - - Delete entire `"alert": {}` block from panel - - Keep `thresholds` array for visual indicators - -## šŸŽÆ Example Alert Structure - -```json -{ - "uid": "alert-name-kebab-case", - "title": "Alert Display Name", - "condition": "C", - "data": [ - { - "refId": "A", - "queryType": "Azure Log Analytics", - "azureLogAnalytics": { - "query": "KQL query here", - "resource": "[parameter(...)]" - }, - "datasourceUid": "F2XodEi7z", - "relativeTimeRange": { - "from": 300, - "to": 0 - } - }, - { - "refId": "B", - "queryType": "", - "datasourceUid": "-100", - "model": { - "expression": "A", - "reducer": "last", - "type": "reduce" - } - }, - { - "refId": "C", - "queryType": "", - "datasourceUid": "-100", - "model": { - "expression": "B", - "type": "threshold", - "conditions": [{ - "evaluator": {"params": [threshold], "type": "lt|gt"}, - "type": "query" - }] - } - } - ], - "noDataState": "KeepLast|OK|NoData|Alerting", - "execErrState": "KeepLast|Alerting", - "for": "5m", - "annotations": { - "description": "Alert message with @mentions" - }, - "labels": { - "NotificationId": "unique-id" - }, - "folderUID": "arcade-services", - "ruleGroup": "PCS Alerts", - "intervalSeconds": 60, - "isPaused": false -} -``` - -## šŸš€ Testing Alert Rules - -After provisioning: - -1. **Verify in Grafana UI**: - ``` - Navigate to: Alerting → Alert rules - Expected: See "PCS Work Item Success Rate alert", "PCS Exceptions High" - ``` - -2. **Check alert evaluation**: - ``` - Each alert should show: - - State: OK / Firing / Pending / NoData - - Last evaluation time - - Next evaluation time - ``` - -3. **Test notifications**: - ``` - - Wait for alert to fire naturally, OR - - Temporarily lower threshold to trigger alert - - Verify notification sent to contact point - ``` - -4. 
**View alert history**: - ``` - Navigate to: Alerting → Alert instances - See firing history and state changes - ``` - -## šŸ“ Notes - -- Contact points (statusHook, Teams Alert) already created and working -- Notification routing happens automatically via notification policies -- Alert rules are independent of dashboards -- Can have multiple alerts on same query -- Supports complex multi-condition logic via expression queries - -## āš ļø Current State - -**IMPORTANT**: Only 2 of 15+ alerts have been migrated so far. The remaining alerts need to be converted following the same pattern as the two examples. - -The SDK is ready - it will automatically pick up any new `.alert.json` files added to the `alertrules/Staging/` or `alertrules/Production/` directories. - -## šŸ”§ Quick Reference - -**Convert frequency to seconds**: -- "1m" → 60 -- "5m" → 300 -- "1h" → 3600 - -**State mapping**: -- `keep_state` → `KeepLast` -- `alerting` → `Alerting` -- `ok` → `OK` -- `no_data` → `NoData` - -**Condition operators**: -- `lt` = less than (<) -- `gt` = greater than (>) -- `within_range` = between two values -- `outside_range` = outside range - -**Reducer functions**: -- `last` = most recent value -- `avg` = average -- `min` = minimum -- `max` = maximum -- `sum` = sum diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json deleted file mode 100644 index 393d88091..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "uid": "VrkJ-7W7z", - "type": "grafana-simple-json-datasource", - "typeLogoUrl": "", - "access": "proxy", - "url": "https://dotneteng-status.azurewebsites.net/api/annotations", - "password": "", - "user": "", - "database": "", - "basicAuth": false, - "basicAuthUser": "abcd", - "basicAuthPassword": "", - 
"withCredentials": false, - "isDefault": false, - "jsonData": { - "httpHeaderName1": "Authorization", - "tlsAuth": false, - "tlsSkipVerify": false - }, - "readOnly": false, - "secureJsonData": { - "basicAuthPassword": "", - "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" - } -} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json deleted file mode 100644 index acfad94b2..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "uid": "VrkJ-7W7z", - "type": "grafana-simple-json-datasource", - "typeLogoUrl": "", - "access": "proxy", - "url": "https://dotneteng-status-staging.azurewebsites.net/api/annotations", - "password": "", - "user": "", - "database": "", - "basicAuth": false, - "basicAuthUser": "abcd", - "basicAuthPassword": "", - "withCredentials": false, - "isDefault": false, - "jsonData": { - "httpHeaderName1": "Authorization", - "tlsAuth": false, - "tlsSkipVerify": false - }, - "readOnly": false, - "secureJsonData": { - "basicAuthPassword": "", - "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" - } -} diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index e9b8c44ae..74e2dcaaf 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -133,7 +133,6 @@ public async Task GetDataSourceByUidAsync(string uid) public async Task CreateFolderAsync(string uid, string title) { // First try to get the folder - if it exists, just return it - // This handles the built-in "general" folder which can't be updated var getUri = new Uri(new Uri(_baseUrl), $"/api/folders/{Uri.EscapeDataString(uid)}"); using (HttpResponseMessage getResponse = await _client.GetAsync(getUri).ConfigureAwait(false)) { diff --git 
a/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs b/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs deleted file mode 100644 index 04410f331..000000000 --- a/src/SecretManager/Microsoft.DncEng.SecretManager/SecretTypes/AzureManagedGrafanaApiKey.cs +++ /dev/null @@ -1,51 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Globalization; -using Microsoft.DncEng.CommandLineLib; - -namespace Microsoft.DncEng.SecretManager.SecretTypes; - -[Name("azure-managed-grafana-api-key")] -public class AzureManagedGrafanaApiKey : GenericAccessToken -{ - private readonly string[] _expirationDateFormats = new[] { "yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss" }; - - protected override string HelpMessage => "Please login to https://{0} and navigate to Administration > Service accounts to create a new service account token."; - protected override string TokenName => "Azure Managed Grafana API key"; - protected override string TokenFormatDescription => "Service account token (starts with 'glsa_')"; - protected override string ExpirationFormatDescription => "format yyyy-MM-dd followed by optional time part hh:mm:ss or empty for no expiration"; - protected override bool HasExpiration => true; - - protected override IEnumerable> EnvironmentToHost => new[] - { - new KeyValuePair( "production", "https://dnceng-grafana-eraubnb4dkatgnfn.wus2.grafana.azure.com/" ), - new KeyValuePair( "staging", "https://dnceng-grafana-staging-faf3f3ebf0f8afbm.wus2.grafana.azure.com/" ) - }; - - public AzureManagedGrafanaApiKey(ISystemClock clock, IConsole console) : base(clock, console) - { - } - - protected override bool TryParseExpirationDate(string value, out DateTime parsedValue) - { - if (string.IsNullOrWhiteSpace(value)) - { - parsedValue = DateTime.MaxValue; - return true; - } - return DateTime.TryParseExact(value, _expirationDateFormats, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsedValue); - } 
- - protected override bool ValidateToken(string token) - { - // Azure Managed Grafana service account tokens start with "glsa_" - if (token.StartsWith("glsa_", StringComparison.Ordinal)) - { - Console.WriteLine("Azure Managed Grafana service account token validated successfully."); - return true; - } - - Console.WriteLine("Invalid token format. Azure Managed Grafana tokens must start with 'glsa_'."); - return false; - } -}