diff --git a/.vault-config/dnceng-amg-int-kv.yaml b/.vault-config/dnceng-amg-int-kv.yaml new file mode 100644 index 000000000..8648a73b8 --- /dev/null +++ b/.vault-config/dnceng-amg-int-kv.yaml @@ -0,0 +1,24 @@ +storageLocation: + type: azure-key-vault + parameters: + subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 + name: dnceng-amg-int-kv + +secrets: + # API token for DotNet Status website + dotnet-build-bot-dotnet-eng-status-token: + type: text + parameters: + description: API token from https://dotneteng-status-staging.azurewebsites.net/ - Generated using dotnet-build-bot account + + # Authorization header for Deployment Annotations datasource + dotneteng-status-auth-header: + type: text + parameters: + description: "Bearer token for status API - Format: Bearer " + + # Teams webhook URL for alert notifications + fr-bot-notifications-teams-notification-url: + type: text + parameters: + description: Teams Incoming Webhook URL - Do not rotate \ No newline at end of file diff --git a/.vault-config/dnceng-amg-prod-kv.yaml b/.vault-config/dnceng-amg-prod-kv.yaml new file mode 100644 index 000000000..c20915d7d --- /dev/null +++ b/.vault-config/dnceng-amg-prod-kv.yaml @@ -0,0 +1,24 @@ +storageLocation: + type: azure-key-vault + parameters: + subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1 + name: dnceng-amg-prod-kv + +secrets: + # API token for DotNet Status website + dotnet-build-bot-dotnet-eng-status-token: + type: text + parameters: + description: API token from https://dotneteng-status.azurewebsites.net/ - Generated using dotnet-build-bot account + + # Authorization header for Deployment Annotations datasource + dotneteng-status-auth-header: + type: text + parameters: + description: "Bearer token for status API - Format: Bearer " + + # Teams webhook URL for alert notifications + fr-bot-notifications-teams-notification-url: + type: text + parameters: + description: Teams Incoming Webhook URL - Do not rotate diff --git a/azure-pipelines-pr.yml 
b/azure-pipelines-pr.yml index 1730b3e64..9f74245af 100644 --- a/azure-pipelines-pr.yml +++ b/azure-pipelines-pr.yml @@ -105,4 +105,4 @@ stages: dotnet run --project src/SecretManager/Microsoft.DncEng.SecretManager -- validate-all -b src @manifestArgs displayName: Verify Secret Usages - - template: /eng/test.yaml \ No newline at end of file + - template: /eng/test.yaml diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 18819753d..f4aa6187f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,6 +7,8 @@ variables: - name: _DotNetArtifactsCategory value: .NETCore - group: SDL_Settings + - name: ServiceConnectionName + value: 'Dotnet Engineering services' trigger: batch: true @@ -195,6 +197,24 @@ extends: contents: '*' targetFolder: $(Build.ArtifactStagingDirectory)\eng + - task: AzureCLI@2 + displayName: 'Validate Grafana Bicep Template' + inputs: + azureSubscription: '$(ServiceConnectionName)' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Grafana Bicep template..." 
+ if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { + throw "Bicep template not found: azure-managed-grafana.bicep" + } + + az bicep build --file eng/deployment/azure-managed-grafana.bicep + if ($LASTEXITCODE -ne 0) { + throw "Bicep template validation failed" + } + Write-Host "SUCCESS: Bicep template validation successful" + - template: /eng/common/templates-official/post-build/post-build.yml@self parameters: enableSymbolValidation: false @@ -225,11 +245,14 @@ extends: PublishProfile: Int ServiceConnectionName: NetHelixStaging StatusVariableGroup: DotNetStatus Staging - GrafanaHost: https://dotnet-eng-grafana-staging.westus2.cloudapp.azure.com - GrafanaKeyVault: dotnet-grafana-staging - GrafanaVariableGroup: Dotnet-Grafana-Staging ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0 ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905 + AMGServiceConnectionName: 'Dotnet Engineering services' + AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba + AMGDeploymentEnvironment: Staging + AMGGrafanaWorkspaceName: dnceng-grafana-staging + AMGGrafanaKeyVault: dnceng-amg-int-kv ${{ else }}: DeploymentEnvironment: Production DotNetStatusAppName: dotneteng-status @@ -237,8 +260,11 @@ extends: PublishProfile: Prod ServiceConnectionName: NetHelix StatusVariableGroup: DotNetStatus Production - GrafanaHost: https://dotnet-eng-grafana.westus2.cloudapp.azure.com - GrafanaKeyVault: dotnet-grafana - GrafanaVariableGroup: Dotnet-Grafana-Production ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba - ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 \ No newline at end of file + ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1 + AMGServiceConnectionName: 'Dotnet Engineering services' + AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55 + AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba + AMGDeploymentEnvironment: 
Production + AMGGrafanaWorkspaceName: dnceng-grafana + AMGGrafanaKeyVault: dnceng-amg-prod-kv diff --git a/eng/deploy-managed-grafana.yml b/eng/deploy-managed-grafana.yml new file mode 100644 index 000000000..de9e2368d --- /dev/null +++ b/eng/deploy-managed-grafana.yml @@ -0,0 +1,208 @@ +parameters: +- name: AMGServiceConnectionName + type: string +- name: AMGServiceConnectionClientId + type: string +- name: AMGServiceConnectionId + type: string +- name: AMGDeploymentEnvironment + type: string +- name: AMGGrafanaWorkspaceName + type: string +- name: AMGGrafanaKeyVault + type: string + +stages: +- stage: DeployGrafana + displayName: 'Deploy Grafana Infrastructure and Dashboards' + pool: + name: NetCore1ESPool-Internal-NoMSI + demands: ImageOverride -equals 1es-windows-2019 + dependsOn: + - predeploy + - approval + jobs: + - template: /eng/provision-grafana.yaml@self + parameters: + DeploymentEnvironment: ${{ parameters.AMGDeploymentEnvironment }} + ServiceConnectionName: ${{ parameters.AMGServiceConnectionName }} + GrafanaResourceGroup: 'monitoring-managed' + GrafanaWorkspaceName: ${{ parameters.AMGGrafanaWorkspaceName }} + GrafanaLocation: 'westus2' + GrafanaKeyVault: ${{ parameters.AMGGrafanaKeyVault }} + - job: SetupToken + dependsOn: ProvisionGrafana + displayName: 'Setup Grafana API Token' + variables: + GrafanaEndpoint: $[ dependencies.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + steps: + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Grafana Admin Role' + inputs: + azureSubscription: ${{ parameters.AMGServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Grafana Admin role..." 
+ + $workspaceName = "${{ parameters.AMGGrafanaWorkspaceName }}" + $rgName = "monitoring-managed" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Grafana workspace resource ID + $grafanaId = az grafana show --name $workspaceName --resource-group $rgName --query "id" --output tsv + Write-Host "Grafana Workspace: $workspaceName" + Write-Host "Grafana ID: $grafanaId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $grafanaId ` + --role "Grafana Admin" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "✓ Pipeline service principal already has Grafana Admin role" + } else { + Write-Host "Granting Grafana Admin role..." + az role assignment create ` + --role "Grafana Admin" ` + --assignee $spObjectId ` + --scope $grafanaId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "✓ Pipeline service principal granted Grafana Admin role" + Write-Host "⏱ Waiting 15 seconds for role assignment to propagate..." 
+ Start-Sleep -Seconds 15 + } else { + Write-Error "Failed to grant Grafana Admin role" + exit 1 + } + } + + - task: AzureCLI@2 + displayName: 'Create or Validate Grafana API Token' + inputs: + azureSubscription: ${{ parameters.AMGServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'scriptPath' + scriptPath: 'eng/setup-grafana-api-token.ps1' + arguments: >- + -Environment "${{ parameters.AMGDeploymentEnvironment }}" + -KeyVaultName "${{ parameters.AMGGrafanaKeyVault }}" + + - job: PublishDashboards + displayName: 'Publish Dashboards to Azure Managed Grafana' + dependsOn: + - ProvisionGrafana + - SetupToken + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + variables: + GrafanaEndpoint: $[ dependencies.ProvisionGrafana.outputs['ExportGrafanaInfo.GrafanaEndpoint'] ] + System.AccessToken: $(System.AccessToken) + steps: + - task: UseDotNet@2 + displayName: 'Install Correct .NET Version' + inputs: + useGlobalJson: true + + - script: dotnet publish --configuration Release $(Build.SourcesDirectory)\src\Monitoring\Sdk\Microsoft.DotNet.Monitoring.Sdk.csproj -f net8.0 + displayName: 'Build Monitoring SDK' + + - task: AzureCLI@2 + displayName: 'Publish Grafana Dashboards' + inputs: + azureSubscription: ${{ parameters.AMGServiceConnectionName }} + scriptType: 'pscore' + scriptLocation: 'inlineScript' + addSpnToEnvironment: true + inlineScript: | + Write-Host "==========================================" + Write-Host "Publishing Dashboards to Azure Managed Grafana" + Write-Host "==========================================" + Write-Host "Grafana Endpoint: $(GrafanaEndpoint)" + Write-Host "Environment: ${{ parameters.AMGDeploymentEnvironment }}" + Write-Host "" + + # Get the API token from Key Vault with retry logic for RBAC propagation + $tokenSecretName = "grafana-admin-api-key" + Write-Host "Retrieving API token from Key Vault..." 
+ + $apiToken = $null + $maxRetries = 5 # total number of attempts, including the first one + $retryCount = 0 + $waitSeconds = 60 + + while (-not $apiToken -and $retryCount -lt $maxRetries) { + try { + $apiToken = az keyvault secret show --vault-name "${{ parameters.AMGGrafanaKeyVault }}" --name $tokenSecretName --query "value" --output tsv 2>$null # stderr is discarded: merging it into stdout would let any CLI warning end up inside the captured token + + if ($LASTEXITCODE -eq 0 -and $apiToken -and $apiToken.Trim()) { + Write-Host "✓ API token retrieved successfully from Key Vault" + break + } else { + $apiToken = $null + throw "Failed to retrieve token" # handled by the catch block below, which owns the wait-and-retry logic + } + } catch { + $retryCount++ + if ($retryCount -lt $maxRetries) { + Write-Host "⏱ Waiting for Key Vault access (attempt $retryCount/$maxRetries, waiting $waitSeconds seconds)..." + Start-Sleep -Seconds $waitSeconds + } else { + Write-Error "Unable to retrieve API token after $maxRetries attempts ($(($maxRetries - 1) * $waitSeconds) seconds total)" # no wait follows the final attempt + Write-Error "Secret name: $tokenSecretName" + Write-Error "Key Vault: ${{ parameters.AMGGrafanaKeyVault }}" + Write-Error "" + Write-Error "Possible causes:" + Write-Error "1. RBAC permissions haven't propagated yet (can take 5-10 minutes)" + Write-Error "2. The SetupToken job failed to create the token" + Write-Error "3. The pipeline service principal doesn't have Key Vault Secrets Officer role" + Write-Error "" + exit 1 + } + } + } + + Write-Host "" + Write-Host "Publishing dashboards using MSBuild SDK..." 
+ Write-Host "" + + # Publish using the same MSBuild SDK as self-hosted Grafana + dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj ` + --configuration Release ` + -t:PublishGrafana ` + -p:GrafanaAccessToken=$apiToken ` + -p:GrafanaHost="$(GrafanaEndpoint)" ` + -p:GrafanaKeyVaultName="${{ parameters.AMGGrafanaKeyVault }}" ` + -p:GrafanaEnvironment="${{ parameters.AMGDeploymentEnvironment }}" ` + -p:ParametersFile=parameters.json ` + -p:ClientId="${{ parameters.AMGServiceConnectionClientId }}" ` + -p:ServiceConnectionId="${{ parameters.AMGServiceConnectionId }}" ` + -p:SystemAccessToken="$(System.AccessToken)" ` + -v:normal + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to publish dashboards to Grafana" + exit 1 + } + + Write-Host "" + Write-Host "==========================================" + Write-Host "✓ SUCCESS! Dashboards Published" + Write-Host "==========================================" + Write-Host "" + Write-Host "View your dashboards at:" + Write-Host "$(GrafanaEndpoint)/dashboards" + Write-Host "" + diff --git a/eng/deploy.yaml b/eng/deploy.yaml index f44ef9b6f..5e3976839 100644 --- a/eng/deploy.yaml +++ b/eng/deploy.yaml @@ -15,11 +15,17 @@ parameters: type: string - name: StatusVariableGroup type: string -- name: GrafanaHost +- name: AMGServiceConnectionName type: string -- name: GrafanaKeyVault +- name: AMGServiceConnectionClientId type: string -- name: GrafanaVariableGroup +- name: AMGServiceConnectionId + type: string +- name: AMGDeploymentEnvironment + type: string +- name: AMGGrafanaWorkspaceName + type: string +- name: AMGGrafanaKeyVault type: string # --- Secret Variable group requirements --- @@ -149,6 +155,15 @@ stages: DeploymentType: zipDeploy RemoveAdditionalFilesFlag: true +- template: /eng/deploy-managed-grafana.yml@self + parameters: + AMGServiceConnectionName: ${{ parameters.AMGServiceConnectionName }} + AMGServiceConnectionId: ${{ parameters.AMGServiceConnectionId }} + 
AMGServiceConnectionClientId: ${{ parameters.AMGServiceConnectionClientId }} + AMGDeploymentEnvironment: ${{ parameters.AMGDeploymentEnvironment }} + AMGGrafanaWorkspaceName: ${{ parameters.AMGGrafanaWorkspaceName }} + AMGGrafanaKeyVault: ${{ parameters.AMGGrafanaKeyVault }} + - stage: postdeploy displayName: Post-Deployment pool: @@ -156,19 +171,9 @@ stages: demands: ImageOverride -equals 1es-windows-2022 dependsOn: - deploy + - DeployGrafana variables: - group: ${{ parameters.StatusVariableGroup }} - - group: ${{ parameters.GrafanaVariableGroup }} - - name: DeploymentEnvironment - value: ${{ parameters.DeploymentEnvironment }} - - name: GrafanaHost - value: ${{ parameters.GrafanaHost }} - - name: GrafanaKeyVault - value: ${{ parameters.GrafanaKeyVault }} - - name: GrafanaClientId - value: ${{ parameters.ServiceConnectionClientId }} - - name: GrafanaServiceConnectionId - value: ${{ parameters.ServiceConnectionId }} jobs: - job: notifyEndDeployment displayName: Notify deployment end @@ -182,17 +187,6 @@ stages: serviceConnection: ${{ parameters.DotNetStatusEndpoint }} method: POST urlSuffix: /dnceng/$(Build.BuildNumber)/end - - job: updateMetrics - displayName: Update Grafana Metrics - steps: - - task: UseDotNet@2 - displayName: Install Correct .NET Version - inputs: - useGlobalJson: true - - script: dotnet publish --configuration Release $(Build.SourcesDirectory)\src\Monitoring\Sdk\Microsoft.DotNet.Monitoring.Sdk.csproj -f net8.0 - displayName: Build Monitoring SDK - - script: dotnet build $(Build.SourcesDirectory)\src\Monitoring\Monitoring.ArcadeServices\Monitoring.ArcadeServices.proj --configuration Release -t:PublishGrafana -p:GrafanaAccessToken=$(grafana-admin-api-key) -p:GrafanaHost=$(GrafanaHost) -p:GrafanaKeyVaultName=$(GrafanaKeyVault) -p:ClientId=$(GrafanaClientId) -p:ServiceConnectionId=$(GrafanaServiceConnectionId) -p:SystemAccessToken=$(System.AccessToken) -p:GrafanaEnvironment=$(DeploymentEnvironment) -p:ParametersFile=parameters.json -v:normal - 
displayName: Publish Grafana Dashboards - stage: validateDeployment displayName: Validate deployment @@ -201,6 +195,7 @@ stages: demands: ImageOverride -equals 1es-windows-2022 dependsOn: - deploy + - DeployGrafana jobs: - job: scenario displayName: Scenario tests diff --git a/eng/deployment/azure-managed-grafana.bicep b/eng/deployment/azure-managed-grafana.bicep new file mode 100644 index 000000000..f1cef2e65 --- /dev/null +++ b/eng/deployment/azure-managed-grafana.bicep @@ -0,0 +1,199 @@ +// Azure Managed Grafana Workspace Bicep Template +@description('The Azure region where the Grafana workspace will be deployed') +param location string + +@description('The name of the Grafana workspace') +param grafanaWorkspaceName string + +@description('The pricing tier for the Grafana workspace') +@allowed([ + 'Standard' + 'Essential' +]) +param skuName string = 'Standard' + +@description('The pricing tier for the Grafana key vault') +@allowed([ + 'standard' + 'premium' +]) +param kvSkuName string = 'standard' + +// Key Vault defines a single SKU family, 'A'; the standard/premium tier is selected through kvSkuName. +@description('The key vault sku family') +@allowed([ + 'A' +]) +param kvSkuFamily string = 'A' + +@description('The deployment environment (Staging or Production)') +param environment string + +@description('The name of the Key Vault for Grafana secrets') +param keyVaultName string + +@description('The tenant ID for Azure AD') +param tenantId string = tenant().tenantId + +@description('The Azure AD Object ID of the .NET Engineering Services group') +param dotnetEngServicesGroupId string = '65d7fc1d-2744-4669-8779-5cd7d7a6b95b' + +// User-assigned managed identity for Grafana +resource grafanaUserAssignedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { + name: environment == 'Production' ? 
'dnceng-managed-grafana' : 'dnceng-managed-grafana-staging' + location: location + tags: { + Environment: environment + Purpose: 'Azure Managed Grafana' + Service: 'DncEng' + } +} + +// Azure Key Vault for Grafana secrets +resource grafanaKeyVault 'Microsoft.KeyVault/vaults@2023-07-01' = { + name: keyVaultName + location: location + tags: { + Environment: environment + Purpose: 'Azure Managed Grafana Secrets' + Service: 'DncEng' + } + properties: { + sku: { + family: kvSkuFamily + name: kvSkuName + } + tenantId: tenantId + enabledForDeployment: false + enabledForDiskEncryption: false + enabledForTemplateDeployment: true + enableSoftDelete: true + softDeleteRetentionInDays: 90 + enableRbacAuthorization: true + enablePurgeProtection: true + publicNetworkAccess: 'Enabled' + networkAcls: { + bypass: 'AzureServices' + defaultAction: 'Allow' + } + } +} + +// Define Key Vault role IDs +var keyVaultSecretsOfficerRoleId = 'b86a8fe4-44ce-4948-aee5-eccb2c155cd7' +var keyVaultCertificatesOfficerRoleId = 'a4417e6f-fecd-4de8-b567-7b0420556985' +var readerRoleId = 'acdd72a7-3385-48ef-bd42-f606fba81ae7' +var keyVaultCertificateUserRoleId = 'db79e9a7-68ee-4b58-9aeb-b90e7c24fcba' +var keyVaultCryptoUserRoleId = '12338af0-0e69-4776-bea7-57ae8d297424' +var keyVaultSecretsUserRoleId = '4633458b-17de-408a-b874-0445c86b69e6' + +// Define Grafana Admin role ID +var grafanaAdminRoleId = '22926164-76b3-42b3-bc55-97df8dab3e41' + +resource grafanaKeyVaultSecretsOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsOfficerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultSecretsOfficerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Reader role to Grafana managed identity +resource grafanaKeyVaultReaderRole 
'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, readerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', readerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Certificate User role to Grafana managed identity +resource grafanaKeyVaultCertificateUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCertificateUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCertificateUserRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Certificates Officer role to Grafana managed identity +resource grafanaKeyVaultCertificatesOfficerRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCertificatesOfficerRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCertificatesOfficerRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Grant Key Vault Crypto User role to Grafana managed identity +resource grafanaKeyVaultCryptoUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultCryptoUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultCryptoUserRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// 
Grant Key Vault Secrets User role to Grafana managed identity +resource grafanaKeyVaultSecretsUserRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaKeyVault.id, grafanaUserAssignedIdentity.id, keyVaultSecretsUserRoleId) + scope: grafanaKeyVault + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', keyVaultSecretsUserRoleId) + principalId: grafanaUserAssignedIdentity.properties.principalId + principalType: 'ServicePrincipal' + } +} + +// Azure Managed Grafana Workspace +resource grafanaWorkspace 'Microsoft.Dashboard/grafana@2023-09-01' = { + name: grafanaWorkspaceName + location: location + sku: { + name: skuName + } + identity: { + type: 'UserAssigned' + userAssignedIdentities: { + '${grafanaUserAssignedIdentity.id}': {} + } + } + properties: { + deterministicOutboundIP: 'Enabled' + apiKey: 'Enabled' + autoGeneratedDomainNameLabelScope: 'TenantReuse' + zoneRedundancy: 'Disabled' + publicNetworkAccess: 'Enabled' + grafanaIntegrations: { + azureMonitorWorkspaceIntegrations: [] + } + } +} + +// Grant Grafana Admin role to .NET Engineering Services group +resource dotnetEngServicesGrafanaAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(grafanaWorkspace.id, dotnetEngServicesGroupId, grafanaAdminRoleId) + scope: grafanaWorkspace + properties: { + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', grafanaAdminRoleId) + principalId: dotnetEngServicesGroupId + principalType: 'Group' + } +} + +// Output the Grafana workspace details +output grafanaWorkspaceUrl string = grafanaWorkspace.properties.endpoint +output grafanaUserAssignedIdentityId string = grafanaUserAssignedIdentity.id + +// Output Key Vault details +output keyVaultName string = grafanaKeyVault.name diff --git a/eng/provision-grafana.yaml b/eng/provision-grafana.yaml new file mode 100644 index 000000000..122422601 --- /dev/null +++ b/eng/provision-grafana.yaml @@ -0,0 
+1,383 @@ +# Azure Managed Grafana Provisioning Template +# This template provisions Azure Managed Grafana workspaces as part of the deployment process + +parameters: +- name: DeploymentEnvironment + type: string + +- name: ServiceConnectionName + type: string + +- name: GrafanaResourceGroup + type: string + +- name: GrafanaWorkspaceName + type: string + +- name: GrafanaLocation + type: string + +- name: GrafanaSkuName + type: string + default: 'Standard' + +- name: GrafanaKeyVault + type: string + +jobs: +- job: ProvisionGrafana + displayName: 'Provision Azure Managed Grafana' + pool: + name: NetCore1ESPool-Internal + demands: ImageOverride -equals 1es-windows-2022 + + steps: + - checkout: self + displayName: 'Checkout Repository' + + - task: AzureCLI@2 + displayName: 'Validate Bicep Template' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Validating Grafana Bicep template..." + if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) { + throw "Bicep template not found: azure-managed-grafana.bicep" + } + + az bicep build --file eng/deployment/azure-managed-grafana.bicep + if ($LASTEXITCODE -ne 0) { + throw "Bicep template validation failed" + } + Write-Host "SUCCESS: Bicep template validation successful" + + - task: AzureCLI@2 + displayName: 'Ensure Resource Group Exists' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + $ErrorActionPreference = 'Continue' + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $location = "${{ parameters.GrafanaLocation }}" + + Write-Host "Checking if resource group '$rgName' exists..." + + # Check if resource group exists + $exists = az group exists --name $rgName + + if ($exists -eq 'false') { + Write-Host "Resource group does not exist. Creating resource group '$rgName' in '$location'..." 
+ az group create --name $rgName --location $location --output none + if ($LASTEXITCODE -ne 0) { + throw "Failed to create resource group '$rgName'" + } + Write-Host "SUCCESS: Resource group created successfully" + } else { + Write-Host "SUCCESS: Resource group '$rgName' already exists" + } + + - task: AzureResourceManagerTemplateDeployment@3 + displayName: 'Deploy Grafana Workspace' + name: DeployGrafana + inputs: + deploymentScope: 'Resource Group' + azureResourceManagerConnection: '${{ parameters.ServiceConnectionName }}' + action: 'Create Or Update Resource Group' + resourceGroupName: '${{ parameters.GrafanaResourceGroup }}' + location: '${{ parameters.GrafanaLocation }}' + templateLocation: 'Linked artifact' + csmFile: 'eng/deployment/azure-managed-grafana.bicep' + overrideParameters: '-location "${{ parameters.GrafanaLocation }}" -grafanaWorkspaceName "${{ parameters.GrafanaWorkspaceName }}" -skuName "${{ parameters.GrafanaSkuName }}" -environment "${{ parameters.DeploymentEnvironment }}" -keyVaultName "${{ parameters.GrafanaKeyVault }}"' + deploymentMode: 'Incremental' + deploymentName: 'grafana-${{ parameters.DeploymentEnvironment }}-$(Build.BuildNumber)' + deploymentOutputs: 'grafanaOutputs' + + - task: PowerShell@2 + displayName: 'Export Grafana Identity' + name: ExportIdentity + inputs: + targetType: 'inline' + script: | + $outputs = '$(grafanaOutputs)' | ConvertFrom-Json + $identityId = $outputs.grafanaUserAssignedIdentityId.value + Write-Host "Grafana User-Assigned Identity ID: $identityId" + Write-Host "##vso[task.setvariable variable=GrafanaIdentityId;isOutput=true]$identityId" + + - task: PowerShell@2 + displayName: 'Export Grafana Endpoint and Key Vault for Dashboard Publishing' + name: ExportGrafanaInfo + inputs: + targetType: 'inline' + script: | + $outputs = '$(grafanaOutputs)' | ConvertFrom-Json + $endpoint = $outputs.grafanaWorkspaceUrl.value + $keyVaultName = $outputs.keyVaultName.value + + Write-Host "Grafana Endpoint: $endpoint" + 
Write-Host "Key Vault Name: $keyVaultName" + + Write-Host "##vso[task.setvariable variable=GrafanaEndpoint;isOutput=true]$endpoint" + Write-Host "##vso[task.setvariable variable=KeyVaultName;isOutput=true]$keyVaultName" + + - task: AzureCLI@2 + displayName: 'Grant Pipeline Service Principal Key Vault Secrets Access' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Granting pipeline service principal Key Vault Secrets Officer role..." + + $kvName = "${{ parameters.GrafanaKeyVault }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + + # Get the current service principal object ID + $spObjectId = az account show --query "user.name" --output tsv + Write-Host "Service Principal Object ID: $spObjectId" + + # Get the Key Vault resource ID + $kvId = az keyvault show --name $kvName --resource-group $rgName --query "id" --output tsv + Write-Host "Key Vault: $kvName" + Write-Host "Key Vault ID: $kvId" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $spObjectId ` + --scope $kvId ` + --role "Key Vault Secrets Officer" ` + --query "[0].id" ` + --output tsv + + if ($existingAssignment) { + Write-Host "✓ Pipeline service principal already has Key Vault Secrets Officer role" + } else { + Write-Host "Granting Key Vault Secrets Officer role..." 
+ az role assignment create ` + --role "Key Vault Secrets Officer" ` + --assignee $spObjectId ` + --scope $kvId ` + --output none + + if ($LASTEXITCODE -eq 0) { + Write-Host "✓ Pipeline service principal granted Key Vault Secrets Officer role" + } else { + Write-Error "Failed to grant Key Vault Secrets Officer role" + exit 1 + } + } + + Write-Host "" + Write-Host "ℹ️ Note: Azure RBAC permissions can take 5-10 minutes to propagate to Key Vault data plane" + Write-Host "ℹ️ The PublishDashboards stage has retry logic to handle propagation delays" + + - task: AzureCLI@2 + displayName: 'Grant Grafana Identity Monitoring Reader Access' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'pscore' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "==========================================" + Write-Host "Granting Monitoring Reader to Grafana Identity" + Write-Host "==========================================" + Write-Host "" + + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + $environment = "${{ parameters.DeploymentEnvironment }}" + + # Get the user-assigned managed identity (name matches Bicep template) + $managedIdentityName = if ($environment -eq 'Production') { 'dnceng-managed-grafana' } else { 'dnceng-managed-grafana-staging' } + Write-Host "Retrieving managed identity: $managedIdentityName" + + $identity = az identity show --name $managedIdentityName --resource-group $rgName --query '{principalId:principalId, clientId:clientId}' --output json | ConvertFrom-Json + + if (-not $identity) { + Write-Error "Failed to retrieve managed identity: $managedIdentityName" + exit 1 + } + + $principalId = $identity.principalId + $clientId = $identity.clientId + + Write-Host "✓ Managed Identity: $managedIdentityName" + Write-Host "✓ Principal ID: $principalId" + Write-Host "✓ Client ID: $clientId" + Write-Host "" + + # Define common subscriptions shared across all 
environments + $commonSubscriptions = @( + @{name="HelixStaging"; id="cab65fc3-d077-467d-931f-3932eabf36d3"}, + @{name="dnceng-internaltooling"; id="84a65c9a-787d-45da-b10a-3a1cefce8060"}, + @{name="Dotnet Engineering services"; id="a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1"}, + @{name="Helix"; id="68672ab8-de0c-40f1-8d1b-ffb20bd62c0f"} + ) + + # Add environment-specific subscriptions + $subscriptions = @() + if ($environment -eq "Staging") { + $subscriptions += @{name=".NET Product Construction Services - Staging"; id="e6b5f9f5-0ca4-4351-879b-014d78400ec2"} + } else { + $subscriptions += @{name=".NET Product Construction Services"; id="fbd6122a-9ad3-42e4-976e-bccb82486856"} + } + $subscriptions += $commonSubscriptions + + Write-Host "Granting Monitoring Reader role on $($subscriptions.Count) subscriptions..." + Write-Host "" + + $monitoringReaderRoleId = "43d0d8ad-25c7-4714-9337-8ba259a9fe05" + $successCount = 0 + $failCount = 0 + + foreach ($sub in $subscriptions) { + Write-Host "Processing: $($sub.name) ($($sub.id))" + + # Check if role assignment already exists + $existingAssignment = az role assignment list ` + --assignee $principalId ` + --role "Monitoring Reader" ` + --scope "/subscriptions/$($sub.id)" ` + --query "[0].id" ` + --output tsv 2>$null + + if ($existingAssignment) { + Write-Host " ✓ Role assignment already exists" + $successCount++ + } else { + Write-Host " Creating role assignment..." 
+ + $result = az role assignment create ` + --role "Monitoring Reader" ` + --assignee-object-id $principalId ` + --assignee-principal-type ServicePrincipal ` + --scope "/subscriptions/$($sub.id)" ` + --output none 2>&1 + + if ($LASTEXITCODE -eq 0) { + Write-Host " ✓ Role assignment created successfully" + $successCount++ + } else { + Write-Warning " ⚠ Failed to create role assignment" + Write-Warning " Error: $result" + $failCount++ + } + } + Write-Host "" + } + + Write-Host "==========================================" + Write-Host "Role Assignment Summary" + Write-Host "==========================================" + Write-Host "✓ Successful: $successCount / $($subscriptions.Count)" + if ($failCount -gt 0) { + Write-Host "⚠ Failed: $failCount / $($subscriptions.Count)" + Write-Host "" + Write-Host "Note: Some failures may be due to lack of permissions on target subscriptions." + Write-Host "These role assignments may need to be granted manually by subscription owners." + } + Write-Host "" + Write-Host "ℹ️ RBAC propagation can take 2-5 minutes for Azure Monitor queries to work" + Write-Host "" + + - task: AzureCLI@2 + displayName: 'Install Azure Managed Grafana Extension' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + Write-Host "Installing Azure CLI Azure Managed Grafana extension..." 
+ az extension add --name amg + if ($LASTEXITCODE -ne 0) { + Write-Host "Warning: Failed to install amg extension, will use alternative verification method" + } else { + Write-Host "SUCCESS: Azure Managed Grafana extension installed" + } + + - task: AzureCLI@2 + displayName: 'Verify Grafana Deployment' + inputs: + azureSubscription: '${{ parameters.ServiceConnectionName }}' + scriptType: 'ps' + scriptLocation: 'inlineScript' + inlineScript: | + $workspaceName = "${{ parameters.GrafanaWorkspaceName }}" + $rgName = "${{ parameters.GrafanaResourceGroup }}" + + Write-Host "Verifying Grafana workspace deployment..." + + # Wait for deployment to complete + $maxAttempts = 5 + $attempt = 0 + do { + $attempt++ + Write-Host "Verification attempt $attempt of $maxAttempts..." + + $workspace = az grafana show --name $workspaceName --resource-group $rgName 2>$null | ConvertFrom-Json + if ($workspace -and $workspace.properties.provisioningState -eq "Succeeded") { + break + } + + if ($attempt -lt $maxAttempts) { + Write-Host "Workspace not ready yet, waiting 30 seconds..." 
+ Start-Sleep -Seconds 30 + } + } while ($attempt -lt $maxAttempts) + # Fail when the workspace is missing or never reached the Succeeded state within the retry budget + if (!$workspace -or $workspace.properties.provisioningState -ne "Succeeded") { + throw "Failed to verify Grafana workspace deployment" + } + + Write-Host "GRAFANA WORKSPACE DETAILS:" + Write-Host " Name: $($workspace.name)" + Write-Host " URL: $($workspace.properties.endpoint)" + Write-Host " Location: $($workspace.location)" + Write-Host " SKU: $($workspace.sku.name)" + Write-Host " Status: $($workspace.properties.provisioningState)" + + # Display user-assigned identity details (the identity resource ID is the property name; its last segment is the identity name) + if ($workspace.identity.type -eq "UserAssigned") { + $userIdentities = $workspace.identity.userAssignedIdentities + if ($userIdentities) { + Write-Host " User-Assigned Identities:" + $userIdentities.PSObject.Properties | ForEach-Object { + $identityId = $_.Name + $identityName = $identityId.Split('/')[-1] + Write-Host " Name: $identityName" + Write-Host " Resource ID: $identityId" + } + } + } else { + Write-Host " Principal ID: $($workspace.identity.principalId)" + } + + # Verify role assignments + Write-Host "Checking role assignments..."
+ $roleAssignments = az role assignment list --scope $workspace.id --query '[].{principalId:principalId, roleDefinitionName:roleDefinitionName}' 2>$null | ConvertFrom-Json + if ($roleAssignments) { + $roleAssignments | ForEach-Object { + Write-Host " Role: $($_.roleDefinitionName) - Principal: $($_.principalId)" + } + } else { + Write-Host " No role assignments found" + } + + # Verify Key Vault + $kvName = "${{ parameters.GrafanaKeyVault }}" + Write-Host "" + Write-Host "KEY VAULT DETAILS:" + $keyVault = az keyvault show --name $kvName --resource-group $rgName --query '{name:name, vaultUri:properties.vaultUri, sku:properties.sku.name}' -o json 2>$null | ConvertFrom-Json + if ($keyVault) { + Write-Host " Name: $($keyVault.name)" + Write-Host " Vault URI: $($keyVault.vaultUri)" + Write-Host " SKU: $($keyVault.sku)" + Write-Host " Status: Configured" + } else { + Write-Host " Status: Not found or not accessible" + } + + Write-Host "SUCCESS: ${{ parameters.DeploymentEnvironment }} Grafana deployment verification completed" \ No newline at end of file diff --git a/eng/setup-grafana-api-token.ps1 b/eng/setup-grafana-api-token.ps1 new file mode 100644 index 000000000..419b7b817 --- /dev/null +++ b/eng/setup-grafana-api-token.ps1 @@ -0,0 +1,255 @@ +#!/usr/bin/env pwsh + +param( + [Parameter(Mandatory=$true)] + [ValidateSet("Staging", "Production")] + [string]$Environment, + + [Parameter(Mandatory=$false)] + [string]$ApiToken, + + [Parameter(Mandatory=$true)] + [string]$KeyVaultName +) + +Set-StrictMode -Version Latest +$ErrorActionPreference = "Stop" + +# Determine workspace and Key Vault names +$workspaceName = if ($Environment -eq "Production") { "dnceng-grafana" } else { "dnceng-grafana-staging" } +$resourceGroup = "monitoring-managed" +$keyVaultName = $KeyVaultName +$tokenSecretName = "grafana-admin-api-key" + +Write-Host "==========================================" +Write-Host "Setup Grafana API Token" +Write-Host "==========================================" 
+Write-Host "Environment: $Environment" +Write-Host "Workspace: $workspaceName" +Write-Host "Key Vault: $keyVaultName" +Write-Host "Secret Name: $tokenSecretName" +Write-Host "" + +# Get Grafana endpoint +Write-Host "Getting Grafana workspace endpoint..." +$grafanaInfo = az grafana show --name $workspaceName --resource-group $resourceGroup --query "{endpoint:properties.endpoint, status:properties.provisioningState}" -o json | ConvertFrom-Json + +if (-not $grafanaInfo -or $grafanaInfo.status -ne "Succeeded") { + Write-Error "Grafana workspace '$workspaceName' is not ready. Status: $($grafanaInfo.status)" + exit 1 +} + +$grafanaEndpoint = $grafanaInfo.endpoint +Write-Host "✓ Grafana Endpoint: $grafanaEndpoint" +Write-Host "" + +# Check if token already exists +Write-Host "Checking if API token already exists in Key Vault..." +$existingToken = az keyvault secret show --vault-name $keyVaultName --name $tokenSecretName --query "value" -o tsv 2>$null + +if ($existingToken) { + Write-Host "✓ Found existing token in Key Vault" + Write-Host "" + Write-Host "Validating token..." + + # Test if the token is still valid by calling Grafana API + $headers = @{ + "Authorization" = "Bearer $existingToken" + "Content-Type" = "application/json" + } + + try { + # Test the token by getting org info (lightweight API call) + $testResponse = Invoke-RestMethod -Uri "$grafanaEndpoint/api/org" -Method Get -Headers $headers -ErrorAction Stop + Write-Host "✓ Token is valid and working!" + Write-Host " Organization: $($testResponse.name)" + Write-Host "" + Write-Host "Using existing token. No need to create a new one." + Write-Host "" + Write-Host "==========================================" + Write-Host "✓ Setup Complete!" + Write-Host "==========================================" + Write-Host "" + Write-Host "The existing API token in Key Vault is valid." 
+ Write-Host " Key Vault: $keyVaultName" + Write-Host " Secret: $tokenSecretName" + Write-Host "" + Write-Host "The pipeline can publish dashboards to:" + Write-Host " $grafanaEndpoint" + Write-Host "" + exit 0 + } catch { + Write-Host "⚠ Existing token is invalid or expired" + Write-Host " Error: $($_.Exception.Message)" + Write-Host "" + Write-Host "A new token will be created..." + Write-Host "" + } +} + +# Get API token if not provided +if (-not $ApiToken) { + Write-Host "==========================================" + Write-Host "Automated Service Account Creation" + Write-Host "==========================================" + Write-Host "" + + # Check if AMG extension is installed; install it when missing and stop if the install fails + Write-Host "Checking Azure CLI Grafana extension..." + $amgExtension = az extension list --query "[?name=='amg'].version" -o tsv + if (-not $amgExtension) { + Write-Host "Installing Azure Managed Grafana CLI extension..." + az extension add --name amg --only-show-errors + if ($LASTEXITCODE -ne 0) { throw "Failed to install the Azure Managed Grafana CLI extension (az exit code $LASTEXITCODE)" } else { Write-Host "✓ Extension installed" } + } else { + Write-Host "✓ Azure Managed Grafana extension already installed (version $amgExtension)" + } + Write-Host "" + + # Create service account using Azure CLI + Write-Host "Creating service account 'grafana-admin'..." + Write-Host "Workspace: $workspaceName" + Write-Host "Resource Group: $resourceGroup" + Write-Host "" + + $serviceAccountJson = az grafana service-account create ` + --name $workspaceName ` + --resource-group $resourceGroup ` + --service-account "grafana-admin" ` + --role "Admin" ` + -o json 2>&1 + + if ($LASTEXITCODE -ne 0) { + # Check if it already exists (stderr was merged into the captured output above) + if ($serviceAccountJson -like "*already exists*" -or $serviceAccountJson -like "*409*" -or $serviceAccountJson -like "*Conflict*") { + Write-Host "⚠ Service account 'grafana-admin' already exists, retrieving it..."
+ + $listJson = az grafana service-account list ` + --name $workspaceName ` + --resource-group $resourceGroup ` + -o json 2>&1 + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to list service accounts:" -ErrorAction Continue # non-terminating so the details below still print before exit + Write-Host $listJson + Write-Host "" + Write-Host "This may be a permissions issue. Ensure the pipeline has access to the Grafana workspace." + exit 1 + } + + $serviceAccounts = $listJson | ConvertFrom-Json + $serviceAccount = $serviceAccounts | Where-Object { $_.name -eq "grafana-admin" } | Select-Object -First 1 + + if (-not $serviceAccount) { + Write-Error "Failed to find existing service account 'grafana-admin'" -ErrorAction Continue # keep going so the available accounts are listed + Write-Host "Available service accounts:" + $serviceAccounts | ForEach-Object { Write-Host " - $($_.name) (ID: $($_.id))" } + exit 1 + } + + $serviceAccountId = $serviceAccount.id + Write-Host "✓ Found existing service account with ID: $serviceAccountId" + } else { + Write-Error "Failed to create service account. Details:" -ErrorAction Continue # keep going so the troubleshooting guidance prints + Write-Host "" + Write-Host "Error output:" + Write-Host $serviceAccountJson + Write-Host "" + Write-Host "Common causes:" + Write-Host " 1. Insufficient permissions - Pipeline needs Grafana Admin role" + Write-Host " 2. Grafana workspace not ready - Wait a few minutes and retry" + Write-Host " 3. Network connectivity issues" + Write-Host "" + Write-Host "To grant Grafana Admin role to the pipeline service principal:" + Write-Host " az role assignment create \" + Write-Host " --role 'Grafana Admin' \" + Write-Host " --assignee <service-principal-object-id> \" + Write-Host " --scope /subscriptions/<subscription-id>/resourceGroups/$resourceGroup/providers/Microsoft.Dashboard/grafana/$workspaceName" + exit 1 + } + } else { + $serviceAccount = $serviceAccountJson | ConvertFrom-Json + $serviceAccountId = $serviceAccount.id + Write-Host "✓ Service account created with ID: $serviceAccountId" + } + + Write-Host "" + + # Create service account token (expires in 30 days = 2592000 seconds) + Write-Host "Creating service account token (expires in 30 days)..."
+ + $tokenName = "ci-cd-token-$(Get-Date -Format 'yyyyMMdd-HHmmss')" + + $tokenJson = az grafana service-account token create ` + --name $workspaceName ` + --resource-group $resourceGroup ` + --service-account $serviceAccountId ` + --token $tokenName ` + --time-to-live "30d" ` + -o json + + if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to create service account token:" -ErrorAction Continue # non-terminating so the CLI output below still prints before exit + Write-Host $tokenJson + exit 1 + } + + $tokenResponse = $tokenJson | ConvertFrom-Json + $ApiToken = $tokenResponse.key + + Write-Host "✓ Service account token created" + Write-Host " Token name: $tokenName" + Write-Host " Token ID: $($tokenResponse.id)" + Write-Host " Expires in: 30 days (2592000 seconds)" + Write-Host "" +} + +# Validate token format (Grafana service account tokens start with "glsa_") +if (-not $ApiToken.StartsWith("glsa_")) { + Write-Warning "Token doesn't start with 'glsa_' - this might not be a service account token" + $continue = Read-Host "Continue anyway? (y/N)" + if ($continue -ne "y" -and $continue -ne "Y") { + Write-Host "Aborted." + exit 1 + } +} + +# Store in Key Vault +Write-Host "" +Write-Host "Storing API token in Key Vault..." +Write-Host " Key Vault: $keyVaultName" + +try { + az keyvault secret set ` + --vault-name $keyVaultName ` + --name $tokenSecretName ` + --value $ApiToken ` + --output none + if ($LASTEXITCODE -ne 0) { throw "az keyvault secret set failed with exit code $LASTEXITCODE" } # a native command's non-zero exit does not reach catch on its own + Write-Host "✓ Token stored successfully in Key Vault" +} catch { + Write-Error "Failed to store token in Key Vault: $_" -ErrorAction Continue # non-terminating so the guidance and exit 1 below still run + Write-Host "" + Write-Host "Make sure the pipeline service principal has the following permissions on the Key Vault:" + Write-Host "- Key Vault Secrets Officer (RBAC role)" + Write-Host "" + Write-Host "This should be automatically granted during the ProvisionApplicationGateway stage."
+ Write-Host "If running manually, you can grant yourself access with:" + Write-Host "az role assignment create --role 'Key Vault Secrets Officer' \" + Write-Host " --assignee \" + Write-Host " --scope /subscriptions//resourceGroups/$resourceGroup/providers/Microsoft.KeyVault/vaults/$keyVaultName" + exit 1 +} + +Write-Host "" +Write-Host "==========================================" +Write-Host "✓ Setup Complete!" +Write-Host "==========================================" +Write-Host "" +Write-Host "The API token has been stored in:" +Write-Host " Key Vault: $keyVaultName" +Write-Host " Secret: $tokenSecretName" +Write-Host "" +Write-Host "The pipeline can now publish dashboards to:" +Write-Host " $grafanaEndpoint" +Write-Host "" diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs index 9cacfd777..329bfec26 100644 --- a/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Controllers/AnnotationsController.cs @@ -142,4 +142,87 @@ public async Task>> Post(AnnotationQue return annotationEntries; } + + [HttpPost] + [HttpGet] + [Route("grafana")] + public async Task>> GetGrafanaAnnotations( + [FromBody(EmptyBodyBehavior = Microsoft.AspNetCore.Mvc.ModelBinding.EmptyBodyBehavior.Allow)] GrafanaAnnotationQuery query, + [FromQuery] string from, + [FromQuery] string to, + CancellationToken cancellationToken) + { + DateTime fromDate, toDate; + + if (query?.Range != null) + { + // POST request with body + fromDate = query.Range.From; + toDate = query.Range.To; + } + else if (!string.IsNullOrEmpty(from) && !string.IsNullOrEmpty(to)) + { + // GET request with query parameters + if (!DateTime.TryParse(from, out fromDate) || !DateTime.TryParse(to, out toDate)) + { + return BadRequest("Invalid date format"); + } + } + else + { + return BadRequest("Missing date range"); + } + + IEnumerable services = 
(query?.Annotation?.Query?.Split(',') ?? Array.Empty<string>()) + .Where(s => !string.IsNullOrWhiteSpace(s)) + .Select(s => s.Trim()).ToList(); // materialized once: enumerated by Count, Any and Select below + + if (services.Count() > _maximumServerCount) + { + return new List<GrafanaAnnotation>(); + } + + StringBuilder filterBuilder = new StringBuilder(); + filterBuilder.Append($"Started gt datetime'{fromDate:O}' and Ended lt datetime'{toDate:O}'"); + if (services.Any()) + { + filterBuilder.Append(" and ("); + filterBuilder.Append(string.Join(" or ", services.Select(s => $"PartitionKey eq '{s.Replace("'", "''")}'"))); // double single quotes so a service name cannot break out of the OData string literal + filterBuilder.Append(')'); + } + + string filter = filterBuilder.ToString(); + _logger.LogTrace("Compiled Grafana annotation filter query: {Query}", filter); + + TableClient tableClient = await GetCloudTable(); + IAsyncEnumerable<DeploymentEntity> entityQuery = tableClient.QueryAsync<DeploymentEntity>( + filter: filter, + cancellationToken: cancellationToken); + + List<GrafanaAnnotation> annotations = new List<GrafanaAnnotation>(); + await foreach (DeploymentEntity entity in entityQuery) + { + if (entity.Started == null && entity.Ended == null) + { + continue; + } + + var annotation = new GrafanaAnnotation + { + Time = entity.Started?.ToUnixTimeMilliseconds() ?? entity.Ended.Value.ToUnixTimeMilliseconds(), + Title = $"Deployment of {entity.Service}", + Tags = new[] { "deployment", "deploy", $"deploy-{entity.Service}", entity.Service }, + Text = $"Service: {entity.Service}" + }; + + if (entity.Started != null && entity.Ended != null) + { + annotation.TimeEnd = entity.Ended.Value.ToUnixTimeMilliseconds(); + } + + annotations.Add(annotation); + } + + return annotations; + } } diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs new file mode 100644 index 000000000..914987cf8 --- /dev/null +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotation.cs @@ -0,0 +1,13 @@ +namespace DotNet.Status.Web.Models; +public class GrafanaAnnotation +{ + public long Time { get; set; } + + public long?
TimeEnd { get; set; } + + public string Title { get; set; } + + public string[] Tags { get; set; } + + public string Text { get; set; } +} diff --git a/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs new file mode 100644 index 000000000..49053924d --- /dev/null +++ b/src/DotNet.Status.Web/DotNet.Status.Web/Models/GrafanaAnnotationQuery.cs @@ -0,0 +1,23 @@ +using System; + +namespace DotNet.Status.Web.Models; +public class GrafanaAnnotationQuery +{ + public AnnotationQueryRange Range { get; set; } + public AnnotationDefinition Annotation { get; set; } +} + +public class AnnotationQueryRange +{ + public DateTime From { get; set; } + public DateTime To { get; set; } +} + +public class AnnotationDefinition +{ + public string Name { get; set; } + public string Datasource { get; set; } + public bool Enable { get; set; } + public string IconColor { get; set; } + public string Query { get; set; } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json new file mode 100644 index 000000000..d548c5370 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/cores-consumption.alert.json @@ -0,0 +1,118 @@ +{ + "uid": "cores-consumption", + "title": "Cores consumption", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, 
subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", + "resources": [ + "[parameter(dotnet-eng-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", 
+ "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "15", + "description": "Cores consumption by Autoscaler is above 95% of limit" + }, + "labels": { + "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" + }, + "__dashboardUid__": "quota", + "__panelId__": "15", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": true, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json new file mode 100644 index 000000000..0ec5bb980 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/dotneteng-status-failed-requests.alert.json @@ -0,0 +1,119 @@ +{ + "uid": "dotneteng-status-failed-requests", + "title": "DotNetEng Status Failed Requests/Hour alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "let r = requests | where $__timeFilter(timestamp);\nlet f = coalesce(toscalar(r | summarize min(timestamp)), ago(1d));\nlet t = coalesce(toscalar(r | summarize max(timestamp)), now());\nlet span=(t - f)/60;\nlet interval=case(span >= 1d, bin(span, 1d), span >= 1h, bin(span, 1h), 15m);\nlet intervalHours = interval / 1h;\nr\n| where success == false\n| make-series kind=nonempty valueCount=count() default=0 on timestamp in range(f, t, interval)\n| mv-expand timestamp to typeof(datetime), valueCount to typeof(double)\n| project timestamp, failuresCount=valueCount/intervalHours", + "resources": [ + 
"/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourceGroups/monitoring/providers/microsoft.insights/components/DotNetEng-Status-Prod" + ], + "resultFormat": "time_series", + "workspace": "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 20 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", + "description": "The number of failed DotNetEng Status requests per hour is above 20. 
This may indicate a systemic problem that needs to be investigated.\\nTo initially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" + }, + "labels": { + "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", + "folderUID": "arcade-services", + "ruleGroup": "DotNetEng Status Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json new file mode 100644 index 000000000..7923c81d5 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-availability.alert.json @@ -0,0 +1,144 @@ +{ + "uid": "helix-api-availability", + "title": "Helix API availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "{{ availabilityresult/location }}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "Helix API", + "operator": "eq" + }, + { + "dimension": "availabilityResult/location", + "filter": "*", + "operator": "eq" + } + ], + "metricDefinition": "microsoft.insights/components", + "metricName": 
"availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "helixinfrarg", + "resourceName": "helix-prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 1800, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 99 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", + "description": "Helix API availability alert!" 
+ }, + "labels": { + "NotificationId": "6179576701874a7abc440a574cf636d0" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json new file mode 100644 index 000000000..c70a2adcb --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-api-average-response-time.alert.json @@ -0,0 +1,133 @@ +{ + "uid": "helix-api-average-response-time", + "title": "Helix API Average Response Time", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "requests/duration", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + 
"cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 5000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", + "description": "Helix API Average Response Time is high!" + }, + "labels": { + "NotificationId": "24cae10d9eca44079e7cf3d47f148497" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json new file mode 100644 index 000000000..0ed88673b --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/helix-autoscaler-service-stopped.alert.json @@ -0,0 +1,141 @@ +{ + "uid": "helix-autoscaler-service-stopped", + "title": "Helix AutoScaler Service Stopped Running", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + 
"relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Count", + "alias": "{{cloud/RoleName}}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "cloud/roleName", + "filter": "fabric:/Helix/AutoScaleActorService", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "traces/count", + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "100" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + 
"__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", + "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." + }, + "labels": { + "NotificationId": "6213d3c5ce9a46278343bf075798e46f" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json new file mode 100644 index 000000000..03514f745 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-background-worker-stopped.alert.json @@ -0,0 +1,141 @@ +{ + "uid": "pcs-background-worker-stopped", + "title": "PCS Background Worker Stopped", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Type=tostring(customDimensions[\"WorkItemType\"])\r\n| summarize Count=count() by bin(timestamp, $__interval), Type=replace_string(Type, \"WorkItem\", \"\")\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + 
"queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\"\r\n| summarize TotalCount=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [20], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["C"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "Alerting", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", + "description": "[!IMPORTANT]\n[Description and instructions for this 
alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "23909d48866646408f669cc1c3d325ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json new file mode 100644 index 000000000..8c6054e8a --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-container-job-execution-failures.alert.json @@ -0,0 +1,124 @@ +{ + "uid": "pcs-container-job-execution-failures", + "title": "Container job execution failures alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "ContainerAppSystemLogs_CL\r\n| where TimeGenerated > ago(14d)\r\n| where Log_s has_any (\"has exited with status Succeeded\", \"has exited with status Failed\")\r\n| summarize arg_max(TimeGenerated, Log_s) by JobName_s\r\n| where Log_s has \"has exited with status Failed\"\r\n| project TimeGenerated,JobName=JobName_s, FailedJob=1", + "resources": [ + "[parameter(product-construction-service-workspace-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "metricNamespace": 
"Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resources": [ + { + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resourceGroup": "[parameter(product-construction-service-resourcegroup)]", + "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "max", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", + "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\n\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" + }, + "labels": { + "NotificationId": 
"0a5c68b0daf846ef83a66c6c70fd24ad" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert (no image, 12h reminder)", + "group_wait": "5m", + "repeat_interval": "12h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json new file mode 100644 index 000000000..aa886e7fd --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-disk-space-issues.alert.json @@ -0,0 +1,113 @@ +{ + "uid": "pcs-disk-space-issues", + "title": "PCS Disk Space Issues alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where timestamp > now()-6h and (outerMessage contains \"No space left on device\" or innermostMessage contains \"No space left on device\")\r\n| summarize TotalCount=count() by bin(timestamp, 1h)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + 
"datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "count", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json new file mode 100644 index 000000000..114fe14be --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-exceptions-high.alert.json @@ -0,0 +1,113 @@ +{ + "uid": "pcs-exceptions-high", + "title": "PCS Exceptions High", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": 
"F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where $__timeFilter(timestamp) and problemId !contains \"SpaDefaultPageMiddleware\"\r\n| summarize Exceptions=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [15], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS background work items 
started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "08f669cc1c3d325ee488666464" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json new file mode 100644 index 000000000..25d6b096c --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-git-push-success-rate.alert.json @@ -0,0 +1,113 @@ +{ + "uid": "pcs-git-push-success-rate", + "title": "Git Push success rate alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"GitPush\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + }, + 
"relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [80], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\n\nPCS has a high `git push` failure rate, please investigate\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json new file mode 100644 index 000000000..a059858b8 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/pcs-work-item-success-rate.alert.json @@ -0,0 +1,153 @@ +{ + "uid": 
"pcs-work-item-success-rate", + "title": "PCS Work Item Success Rate alert", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize Successful = countif(Success == true), Failed = countif(Success == false) by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": 
"F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 74 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "d71fe025a8954b6cad9866354ca041ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json new file mode 100644 index 000000000..8e0db517e --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-eastus.alert.json @@ -0,0 +1,116 @@ +{ + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": 
"-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "24", + "description": "An Azure Resource Quota is nearing its limit in region eastus!" + }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "__dashboardUid__": "quota", + "__panelId__": "24", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json new file mode 100644 index 000000000..cba370182 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus.alert.json @@ -0,0 +1,116 @@ +{ + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = 
tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + 
"noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "12", + "description": "An Azure Resource Quota is nearing its limit in region westus!" + }, + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" + }, + "__dashboardUid__": "quota", + "__panelId__": "12", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json new file mode 100644 index 000000000..cc70acc06 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/quota-westus2.alert.json @@ -0,0 +1,112 @@ +{ + "uid": "quota-westus2", + "title": "Azure quota usage for west us 2", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers' and resource != \"standardDASv4Family\"\n| where location == 'westus2'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by 
bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "13", + "description": "An Azure Resource Quota is nearing its limit in region westus2!" 
+ }, + "labels": { + "NotificationId": "44aff3c937c042caa09f821ae923c26c" + }, + "__dashboardUid__": "quota", + "__panelId__": "13", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json new file mode 100644 index 000000000..e56a3241c --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/source-dot-net-availability.alert.json @@ -0,0 +1,138 @@ +{ + "uid": "source-dot-net-availability", + "title": "source.dot.net Availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "source-dot-net", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + 
"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 60 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "15m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", + "description": "source.dot.net availability is low!" 
+ }, + "labels": { + "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", + "folderUID": "arcade-services", + "ruleGroup": "Source Browser Alerts", + "intervalSeconds": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json new file mode 100644 index 000000000..a4d269995 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-build-pools.alert.json @@ -0,0 +1,152 @@ +{ + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": 
true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "home", + "__panelId__": "4", + "description": "95 percentile of work 
item waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "__dashboardUid__": "home", + "__panelId__": "4", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json new file mode 100644 index 000000000..f8d4292b3 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Production/work-items-waiting-time-test-queues.alert.json @@ -0,0 +1,152 @@ +{ + "uid": "work-items-waiting-time-test-queues", + "title": "Work Items Waiting Time Is Too High (Test Queues)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by 
bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + 
"__dashboardUid__": "home", + "__panelId__": "10", + "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-test-queues" + }, + "__dashboardUid__": "home", + "__panelId__": "10", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json new file mode 100644 index 000000000..57705e756 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/cores-consumption.alert.json @@ -0,0 +1,118 @@ +{ + "uid": "cores-consumption", + "title": "Cores consumption", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "query": "let quotaPerSubscription = customEvents \n| where $__timeFilter(timestamp)\n| where name == \"AzureSubscriptionQuotaLimit\"\n| project \n quota = toint(customMeasurements.quota),\n subscription = tostring(customDimensions.subscriptionId),\n timestamp\n| summarize arg_max(timestamp, quota) by subscription\n| project quota, subscription;\ncustomEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| where customDimensions.name == \"standardDv3Family\" or customDimensions.name == \"standardDAv4Family\"\n| project \n cores = toreal(customMeasurements.current),\n subscription = tostring(customDimensions.subscription),\n timestamp\n| join kind=inner quotaPerSubscription on subscription\n| project ['limit'] = quota, cores, timestamp, subscription\n| extend ['percent of limit'] = 
iff(['limit'] == 0, 0.0, cores/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), subscription\n| order by timestamp asc", + "resources": [ + "[parameter(dotnet-eng-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "15", + "description": "Cores consumption by Autoscaler is above 95% of limit" + }, + "labels": { + "NotificationId": "66b2ef8da5c74a2fbbc7d6739f55e4e8" + }, + "__dashboardUid__": "quota", + "__panelId__": "15", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + 
"notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json new file mode 100644 index 000000000..0ec5bb980 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/dotneteng-status-failed-requests.alert.json @@ -0,0 +1,119 @@ +{ + "uid": "dotneteng-status-failed-requests", + "title": "DotNetEng Status Failed Requests/Hour alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "let r = requests | where $__timeFilter(timestamp);\nlet f = coalesce(toscalar(r | summarize min(timestamp)), ago(1d));\nlet t = coalesce(toscalar(r | summarize max(timestamp)), now());\nlet span=(t - f)/60;\nlet interval=case(span >= 1d, bin(span, 1d), span >= 1h, bin(span, 1h), 15m);\nlet intervalHours = interval / 1h;\nr\n| where success == false\n| make-series kind=nonempty valueCount=count() default=0 on timestamp in range(f, t, interval)\n| mv-expand timestamp to typeof(datetime), valueCount to typeof(double)\n| project timestamp, failuresCount=valueCount/intervalHours", + "resources": [ + "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourceGroups/monitoring/providers/microsoft.insights/components/DotNetEng-Status-Prod" + ], + "resultFormat": "time_series", + "workspace": "/subscriptions/68672ab8-de0c-40f1-8d1b-ffb20bd62c0f/resourcegroups/defaultresourcegroup-eus/providers/microsoft.operationalinsights/workspaces/defaultworkspace-68672ab8-de0c-40f1-8d1b-ffb20bd62c0f-eus" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + 
"type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 3600, + "to": 600 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 20 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", + "description": "The number of failed DotNetEng Status requests per hour is above 20. 
This may indicate a systemic problem that needs to be investigated.\\nTo initially investigate prod, run the following query in DotNetEng-Status-Prod, and to investigate staging, run the query in DotNetEng-Status-Staging:\\n\\n```\\nunion exceptions, traces\\n| project timestamp, operation_Name, customDimensions, message, problemId, details\\n| order by timestamp asc\\n```" + }, + "labels": { + "NotificationId": "d2dd705a6c724ed68fcf6955561c06dd" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "44", + "folderUID": "arcade-services", + "ruleGroup": "DotNetEng Status Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json new file mode 100644 index 000000000..7923c81d5 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-availability.alert.json @@ -0,0 +1,144 @@ +{ + "uid": "helix-api-availability", + "title": "Helix API availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "{{ availabilityresult/location }}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "Helix API", + "operator": "eq" + }, + { + "dimension": "availabilityResult/location", + "filter": "*", + "operator": "eq" + } + ], + "metricDefinition": "microsoft.insights/components", + "metricName":
"availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "helixinfrarg", + "resourceName": "helix-prod", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 1800, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 99 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", + "description": "Helix API availability alert!" 
+ }, + "labels": { + "NotificationId": "6179576701874a7abc440a574cf636d0" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "18", + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json new file mode 100644 index 000000000..01189871e --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-api-average-response-time.alert.json @@ -0,0 +1,133 @@ +{ + "uid": "helix-api-average-response-time", + "title": "Helix API Average Response Time", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "requests/duration", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + 
"cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 5000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", + "description": "Helix API Average Response Time is high!" + }, + "labels": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "19", + "NotificationId": "24cae10d9eca44079e7cf3d47f148497" + }, + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json new file mode 100644 index 000000000..0ed88673b --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/helix-autoscaler-service-stopped.alert.json @@ -0,0 +1,141 @@ +{ + "uid": "helix-autoscaler-service-stopped", + "title": "Helix AutoScaler Service Stopped Running", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + 
"relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Count", + "alias": "{{cloud/RoleName}}", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilter": "*", + "dimensionFilters": [ + { + "dimension": "cloud/roleName", + "filter": "fabric:/Helix/AutoScaleActorService", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "traces/count", + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "region": "westus2", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "100" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + 
"__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", + "description": "Helix AutoScaler Service has stopped running - no traces detected in the last 30 minutes." + }, + "labels": { + "NotificationId": "6213d3c5ce9a46278343bf075798e46f" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "29", + "folderUID": "arcade-services", + "ruleGroup": "Helix Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json new file mode 100644 index 000000000..847055a55 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-background-worker-stopped.alert.json @@ -0,0 +1,145 @@ +{ + "uid": "pcs-background-worker-stopped", + "title": "PCS Background Worker Stopped", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Type=tostring(customDimensions[\"WorkItemType\"])\r\n| summarize Count=count() by bin(timestamp, $__interval), Type=replace_string(Type, \"WorkItem\", \"\")\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": 
"Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\"\r\n| summarize TotalCount=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 20 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "Alerting", + "execErrState": "Alerting", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", + "description": "[!IMPORTANT]\n[Description and instructions for this 
alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1308/-Alert-PCS-Background-Worker-Stopped)\n\nPCS appears to have stopped processing new WorkItems.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "23909d48866646408f669cc1c3d325ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "57", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json new file mode 100644 index 000000000..8c6054e8a --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-container-job-execution-failures.alert.json @@ -0,0 +1,124 @@ +{ + "uid": "pcs-container-job-execution-failures", + "title": "Container job execution failures alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "ContainerAppSystemLogs_CL\r\n| where TimeGenerated > ago(14d)\r\n| where Log_s has_any (\"has exited with status Succeeded\", \"has exited with status Failed\")\r\n| summarize arg_max(TimeGenerated, Log_s) by JobName_s\r\n| where Log_s has \"has exited with status Failed\"\r\n| project TimeGenerated,JobName=JobName_s, FailedJob=1", + "resources": [ + "[parameter(product-construction-service-workspace-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + 
"metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resources": [ + { + "metricNamespace": "Microsoft.OperationalInsights/workspaces", + "region": "westus2", + "resourceGroup": "[parameter(product-construction-service-resourcegroup)]", + "resourceName": "[parameter(product-construction-service-workspace-resourcename)]", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]" + } + ], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "max", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", + "description": "[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1350/-Alert-PCS-container-job-execution-failing)\\n\\nPlease note that this alert will fire every 12 hours as the list of failed jobs can change" + }, + "labels": { + "NotificationId": 
"0a5c68b0daf846ef83a66c6c70fd24ad" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "74", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert (no image, 12h reminder)", + "group_wait": "5m", + "repeat_interval": "12h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json new file mode 100644 index 000000000..aa886e7fd --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-disk-space-issues.alert.json @@ -0,0 +1,113 @@ +{ + "uid": "pcs-disk-space-issues", + "title": "PCS Disk Space Issues alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where timestamp > now()-6h and (outerMessage contains \"No space left on device\" or innermostMessage contains \"No space left on device\")\r\n| summarize TotalCount=count() by bin(timestamp, 1h)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + "e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + } + }, + { + "refId": "B", + "queryType": "", + 
"datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "count", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [0], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1346/-Alert-PCS-Disk-Space-Issues)\n\nThe PCS service is running out of disk space.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "aa1fe025a8954b6cad9866354ca041ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "72", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json new file mode 100644 index 000000000..0f6ee8e3f --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-exceptions-high.alert.json @@ -0,0 +1,117 @@ +{ + "uid": "pcs-exceptions-high", + "title": "PCS Exceptions High", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": 
"F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "exceptions\r\n| where $__timeFilter(timestamp) and problemId !contains \"SpaDefaultPageMiddleware\"\r\n| summarize Exceptions=count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 15 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "46", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1311/-Alert-PCS-Exceptions-High)\n\nThe PCS 
exception rate is unusually high.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "08f669cc1c3d325ee488666464" + }, + "dashboardUid": "arcadeAvailability", + "panelId": "46", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json new file mode 100644 index 000000000..d8a8cbb3c --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-git-push-success-rate.alert.json @@ -0,0 +1,117 @@ +{ + "uid": "pcs-git-push-success-rate", + "title": "Git Push success rate alert", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"GitPush\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(product-construction-service-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-product-construction-services-subscriptionid)]", + "subscriptions": [ + "fbd6122a-9ad3-42e4-976e-bccb82486856", + 
"e6b5f9f5-0ca4-4351-879b-014d78400ec2" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 80 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", + "description": "[!IMPORTANT]\\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1318/-Alert-PCS-high-git-push-failure-rate)\\n\\nPCS has a high `git push` failure rate, please investigate\\n\\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "6ggqnvwrunnru1zfl4g42dn9qjzanb8a" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "75", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalSeconds": 60, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json new file mode 100644 index 000000000..a059858b8 --- /dev/null +++ 
b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/pcs-work-item-success-rate.alert.json @@ -0,0 +1,153 @@ +{ + "uid": "pcs-work-item-success-rate", + "title": "PCS Work Item Success Rate alert", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize Successful = countif(Success == true), Failed = countif(Success == false) by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "dashboardTime": true, + "query": "customEvents\r\n| where name == \"WorkItemExecuted\" and $__timeFilter(timestamp)\r\n| extend Success = tobool(customDimensions[\"Success\"])\r\n| summarize SuccessRate=100*countif(Success == true)/count() by bin(timestamp, $__interval)\r\n| order by timestamp asc", + "resources": [ + "[parameter(product-construction-service-appinsights-resourcepath)]" + ], + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + 
"dimensionFilters": [], + "timeGrain": "auto" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "hide": false, + "queryType": "Azure Log Analytics", + "refId": "B", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "mean", + "refId": "C", + "type": "reduce" + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 74 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", + "description": "[!IMPORTANT]\n[Description and instructions for this alert](https://dev.azure.com/dnceng/internal/_wiki/wikis/DNCEng%20Services%20Wiki/1310/-Alert-PCS-Work-Item-Success-Rate-alert)\n\nThe PCS background work items started to fail frequently.\n\n@dotnet/prodconsvcs" + }, + "labels": { + "NotificationId": "d71fe025a8954b6cad9866354ca041ee" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "64", + "folderUID": "arcade-services", + "ruleGroup": "PCS Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of 
file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json new file mode 100644 index 000000000..8e0db517e --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-eastus.alert.json @@ -0,0 +1,116 @@ +{ + "uid": "quota-eastus", + "title": "Azure quota usage for east us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'eastus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { 
+ "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "24", + "description": "An Azure Resource Quota is nearing its limit in region eastus!" + }, + "labels": { + "NotificationId": "b50b57fa7d1840438da5232711af4485" + }, + "__dashboardUid__": "quota", + "__panelId__": "24", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json new file mode 100644 index 000000000..cba370182 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus.alert.json @@ -0,0 +1,116 @@ +{ + "uid": "quota-westus", + "title": "Azure quota usage for west us", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where 
$__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers'\n| where location == 'westus'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] * 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 95 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", 
+ "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "12", + "description": "An Azure Resource Quota is nearing its limit in region westus!" + }, + "labels": { + "NotificationId": "e2be2ec3e22e46d28730bab54ff8fa77" + }, + "__dashboardUid__": "quota", + "__panelId__": "12", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json new file mode 100644 index 000000000..cc70acc06 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/quota-westus2.alert.json @@ -0,0 +1,112 @@ +{ + "uid": "quota-westus2", + "title": "Azure quota usage for west us 2", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Log Analytics", + "datasourceUid": "F2XodEi7z", + "model": { + "azureLogAnalytics": { + "query": "customEvents\n| where $__timeFilter(timestamp)\n| where name == 'AzureResourceUsage'\n| project\n timestamp,\n resource = tostring(customDimensions.name),\n location = tostring(customDimensions.location),\n current = toreal(customMeasurements.current),\n ['limit'] = toreal(customMeasurements.['limit']),\n subscription = tostring(customDimensions.subscription)\n| where resource != 'NetworkWatchers' and resource != \"standardDASv4Family\"\n| where location == 'westus2'\n| where subscription in (\"cab65fc3-d077-467d-931f-3932eabf36d3\", \"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f\")\n| extend ['percent of limit'] = iff(['limit'] == 0, 0.0, current/['limit'] 
* 100)\n| summarize ['percent of limit'] = max(['percent of limit']) by bin(timestamp, $__interval), resource\n| order by timestamp asc, resource", + "resource": "[parameter(dotnet-eng-appinsights-resourcepath)]", + "resultFormat": "time_series", + "workspace": "[parameter(default-workspace-resourcepath)]" + }, + "azureMonitor": { + "dimensionFilter": "*", + "dimensionFilters": [], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Log Analytics", + "refId": "A", + "subscription": "[parameter(dotnet-eng-appinsights-subscriptionid)]", + "subscriptions": [ + "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [95], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["B"] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "5m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "quota", + "__panelId__": "13", + "description": "An Azure Resource Quota is nearing its limit in region westus2!" 
+ }, + "labels": { + "NotificationId": "44aff3c937c042caa09f821ae923c26c" + }, + "__dashboardUid__": "quota", + "__panelId__": "13", + "folderUID": "arcade-services", + "ruleGroup": "Azure Quota Alerts", + "intervalMs": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json new file mode 100644 index 000000000..e56a3241c --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/source-dot-net-availability.alert.json @@ -0,0 +1,138 @@ +{ + "uid": "source-dot-net-availability", + "title": "source.dot.net Availability", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "Azure Monitor", + "datasourceUid": "F2XodEi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "azureMonitor": { + "aggregation": "Average", + "alias": "", + "allowedTimeGrainsMs": [ + 60000, + 300000, + 900000, + 1800000, + 3600000, + 21600000, + 43200000, + 86400000 + ], + "dimensionFilters": [ + { + "dimension": "availabilityResult/name", + "filter": "source-dot-net", + "operator": "eq" + } + ], + "metricDefinition": "Microsoft.Insights/components", + "metricName": "availabilityResults/availabilityPercentage", + "metricNamespace": "microsoft.insights/components", + "resources": [ + { + "metricNamespace": "microsoft.insights/components", + "resourceGroup": "dotnet-eng-cluster", + "resourceName": "dotnet-eng", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f" + } + ], + "timeGrain": "auto", + "top": "10" + }, + "datasource": { + "type": "grafana-azure-monitor-datasource", + "uid": "F2XodEi7z" + }, + "queryType": "Azure Monitor", + "refId": "A", + "subscription": "68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "subscriptions": [ + 
"68672ab8-de0c-40f1-8d1b-ffb20bd62c0f", + "cab65fc3-d077-467d-931f-3932eabf36d3" + ] + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "mean", + "refId": "B", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 60 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "KeepLast", + "execErrState": "KeepLast", + "for": "15m", + "frequency": "1m", + "annotations": { + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", + "description": "source.dot.net availability is low!" 
+ }, + "labels": { + "NotificationId": "fb8faaf7600740f98a1c2db076cd1712" + }, + "__dashboardUid__": "arcadeAvailability", + "__panelId__": "16", + "folderUID": "arcade-services", + "ruleGroup": "Source Browser Alerts", + "intervalSeconds": 900000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json new file mode 100644 index 000000000..a4d269995 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-build-pools.alert.json @@ -0,0 +1,152 @@ +{ + "uid": "work-items-waiting-time-build-pools", + "title": "Work Items Waiting Time Is Too High (Build Pools)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + 
"refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "B", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": "home", + "__panelId__": "4", + "description": "95 percentile of work item 
waiting times is over 30 minutes. BuildPool queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-build-pools" + }, + "__dashboardUid__": "home", + "__panelId__": "4", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json new file mode 100644 index 000000000..aa8895e42 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/alertrules/Staging/work-items-waiting-time-test-queues.alert.json @@ -0,0 +1,152 @@ +{ + "uid": "work-items-waiting-time-test-queues", + "title": "Work Items Waiting Time Is Too High (Test Queues)", + "condition": "D", + "data": [ + { + "refId": "A", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 
1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "table" + } + }, + { + "refId": "B", + "queryType": "", + "datasourceUid": "OlcfOPi7z", + "relativeTimeRange": { + "from": 300, + "to": 0 + }, + "model": { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "table" + } + }, + { + "refId": "C", + "queryType": "", + "datasourceUid": "-100", + "model": { + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "A", + "reducer": "min", + "refId": "C", + "type": "reduce" + }, + "relativeTimeRange": { + "from": 300, + "to": 0 + } + }, + { + "refId": "D", + "queryType": "", + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "-100" + }, + "expression": "C", + "refId": "D", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Alerting", + "for": "5m", + "frequency": "5m", + "annotations": { + "__dashboardUid__": 
"home", + "__panelId__": "10", + "description": "95 percentile of work item waiting times is over 35 minutes. Test queues only." + }, + "labels": { + "NotificationId": "work-items-waiting-time-test-queues" + }, + "__dashboardUid__": "home", + "__panelId__": "10", + "folderUID": "arcade-services", + "ruleGroup": "Helix Queue Alerts", + "intervalMs": 300000, + "isPaused": false, + "notification_settings": { + "receiver": ".NET Status Alert", + "group_wait": "5m", + "repeat_interval": "4h" + } +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json index 9299dae5a..c6503e74e 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/arcadeAvailability.dashboard.json @@ -17,15 +17,28 @@ "type": "dashboard" }, { - "datasource": "Deployment Annotations", - "enable": false, + "datasource": { + "type": "yesoreyeram-infinity-datasource", + "uid": "deployment-annotations-infinity" + }, + "enable": true, + "hide": false, "iconColor": "blue", "name": "Deployments", "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" + "columns": [], + "filters": [], + "type": "json", + "source": "url", + "url": "/api/annotations/grafana?from=${__from:date:iso}&to=${__to:date:iso}", + "url_options": { + "data": "", + "method": "GET" + }, + "format": "dataframe", + "format_version": "1.0", + "parser": "backend", + "root_selector": "" } } ] @@ -130,7 +143,6 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "8.3.6", "targets": [ { "azureLogAnalytics": { @@ -1195,7 +1207,8 @@ "dimensionFilters": [], "metricDefinition": "Microsoft.Insights/components", "metricName": "pcs.queue.wait_time", - "metricNamespace": "Azure.ApplicationInsights", + 
"customNamespace": "Azure.ApplicationInsights", + "metricNamespace": "Microsoft.Insights/components", "resourceGroup": "[parameter(product-construction-service-resourcegroup)]", "resourceName": "[parameter(product-construction-service-appinsights-resourcename)]", "timeGrain": "auto" @@ -2198,7 +2211,7 @@ "x": 12, "y": 57 }, - "id": 66, + "id": 75, "options": { "legend": { "calcs": [], @@ -2516,15 +2529,15 @@ "dashboardAlerts": true, "dashboardTitle": "", "maxItems": 10, + "showInactiveAlerts": true, "showOptions": "current", "sortOrder": 1, "stateFilter": { - "alerting": false, - "execution_error": false, - "no_data": false, - "ok": false, - "paused": false, - "pending": false + "firing": true, + "noData": true, + "normal": true, + "error": true, + "pending": true }, "tags": [] }, @@ -3322,10 +3335,12 @@ }, "query": { "params": [ - "A", - "1h", - "now-10m" - ] + "A" + ], + "relativeTimeRange": { + "from": 3600, + "to": 600 + } }, "reducer": { "params": [], @@ -3686,5 +3701,6 @@ "timepicker": {}, "timezone": "", "title": "Service Availability", + "uid": "arcadeAvailability", "weekStart": "" } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json index 779a1ba7b..244cf0c4c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/arcade-services/quota.dashboard.json @@ -17,15 +17,28 @@ "type": "dashboard" }, { - "datasource": "Deployment Annotations", + "datasource": { + "type": "yesoreyeram-infinity-datasource", + "uid": "deployment-annotations-infinity" + }, "enable": true, + "hide": false, "iconColor": "blue", "name": "Deployments", "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" + "columns": [], + "filters": [], + "type": "json", + "source": "url", + "url": 
"/api/annotations/grafana?from=${__from:date:iso}&to=${__to:date:iso}", + "url_options": { + "data": "", + "method": "GET" + }, + "format": "dataframe", + "format_version": "1.0", + "parser": "backend", + "root_selector": "" } } ] @@ -67,15 +80,15 @@ "dashboardAlerts": true, "dashboardTitle": "", "maxItems": 10, + "showInactiveAlerts": true, "showOptions": "current", "sortOrder": 1, "stateFilter": { - "alerting": false, - "execution_error": false, - "no_data": false, - "ok": false, - "paused": false, - "pending": false + "firing": true, + "noData": true, + "normal": true, + "error": true, + "pending": true }, "tags": [] }, diff --git a/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json new file mode 100644 index 000000000..ab2671427 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/dashboard/general/home.dashboard.json @@ -0,0 +1,612 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 8, + "w": 14, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "content": "\n# .NET Engineering Systems Monitoring\n\nFor questions or permission issues, email [dnceng@microsoft.com](mailto:dnceng@microsoft.com)\n\nThis monitoring site is used to monitor all services managed by the .NET Engineering team.\nFor information about what sorts of things are monitored, and how to go about adding new monitoring or alerting, see the [Guidance](https://github.com/dotnet/core-eng/blob/master/Documentation/Alerting.md).\n\nTo see information about privacy and cookies 
visit: [Microsoft Privacy Statement](https://go.microsoft.com/fwlink/?LinkId=521839).\n\n", + "mode": "markdown" + }, + "title": "Introduction", + "type": "text" + }, + { + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 6, + "options": { + "folderId": "[parameter(arcade-services-folderid)]", + "maxItems": 10, + "query": "", + "showHeadings": false, + "showRecentlyViewed": false, + "showSearch": true, + "showStarred": false, + "tags": [] + }, + "title": "arcade-services", + "type": "dashlist" + }, + { + "gridPos": { + "h": 7, + "w": 7, + "x": 17, + "y": 6 + }, + "id": 7, + "options": { + "folderId": "[parameter(helix-service-folderid)]", + "maxItems": 10, + "query": "", + "showHeadings": false, + "showRecentlyViewed": false, + "showSearch": true, + "showStarred": false, + "tags": [] + }, + "title": "helix-services", + "type": "dashlist" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 30 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "5m", + "handler": 1, + "message": "95 percentile of work item waiting times is over 30 minutes. 
BuildPool queues only.", + "name": "Work Items Waiting Time Is Too High (Build Pools) alert", + "noDataState": "ok", + "notifications": [] + }, + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 7, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize 
Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "hide": false, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "time_series" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 30, + "visible": true + } + ], + "title": "Work Items Waiting Time (Build Pools)", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 35 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "5m", + "handler": 1, + "message": "95 percentile of work item waiting times is over 35 minutes. 
Test queues only.", + "name": "Work Items Waiting Time Is Too High (Test Queues)", + "noDataState": "ok", + "notifications": [] + }, + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 7, + "x": 7, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\" and QueueName !contains \".tof\"\n| where QueueName !contains 
\"buildpool\"\n| summarize Percentile95 = percentile(duration, 95) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "hide": false, + "query": "WorkItems \n| where $__timeFilter(Queued)\n| project QueueName = tolower(QueueName), duration = datetime_diff('second', Started, Queued), Queued\n| where QueueName !contains \"osx\" and QueueName !contains \"perf\" and QueueName !contains \"arm\" and QueueName !contains \"arcade\" and QueueName !contains \"xaml\" and QueueName !contains \"appcompat\"\n| where QueueName !contains \"buildpool\"\n| summarize Percentile50 = percentile(duration, 50) / 60 by bin(Queued, 1d)\n| order by Queued asc ", + "querySource": "raw", + "rawMode": true, + "refId": "B", + "resultFormat": "time_series" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 35, + "visible": true + } + ], + "title": "Work Items Waiting Time (Test Queues)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 13, + "w": 7, + "x": 17, + "y": 13 + }, + "id": 9, + "options": { + "alertName": "", + "dashboardAlerts": false, + "dashboardTitle": "", + "maxItems": 10, + "showOptions": "current", + "sortOrder": 1, + "stateFilter": { + "alerting": true, + "execution_error": false, + "no_data": false, + "ok": false, + "paused": false, + "pending": false + }, + "tags": [] + }, + "title": "Currently Active Alerts", + "type": "alertlist" + }, + { + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "description": "This panel shows all the times the Helix client crashed while processing a 
work item. The goal for this is to be greater than 99.99% reliability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1.0002, + "axisSoftMin": 0.9998, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.9999 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 17 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "database": "engineeringdata", + "datasource": { + "type": "grafana-azure-data-explorer-datasource", + "uid": "OlcfOPi7z" + }, + "expression": { + "groupBy": { + "expressions": [], + "type": "and" + }, + "reduce": { + "expressions": [], + "type": "and" + }, + "where": { + "expressions": [], + "type": "and" + } + }, + "query": "WorkItems\n| where Finished > ago(120d)\n| extend my=startofmonth(Finished)\n| summarize Pass=countif(Status != \"None\"), Fail=countif(Status == \"None\") by my\n| project my, PassPercent = toreal(Pass) / toreal(Pass + Fail)\n| order by my desc \n| limit 4\n| order by my asc", + "querySource": "raw", + "rawMode": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Number of Crashes in Helix", + "type": "timeseries" + } + ], + "schemaVersion": 34, + "style": "dark", 
+ "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-4M", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ], + "type": "timepicker" + }, + "timezone": "browser", + "title": "Home", + "uid": "home", + "weekStart": "" +} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json new file mode 100644 index 000000000..2486df2a8 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations (Infinity).datasource.json @@ -0,0 +1,16 @@ +{ + "uid": "deployment-annotations-infinity", + "name": "Deployment Annotations (Infinity)", + "type": "yesoreyeram-infinity-datasource", + "access": "proxy", + "url": "https://dotneteng-status.azurewebsites.net", + "jsonData": { + "tlsSkipVerify": false, + "httpHeaderName1": "Authorization" + }, + "secureJsonData": { + "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" + }, + "isDefault": false, + "readOnly": false +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json deleted file mode 100644 index 393d88091..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/datasource/Production/Deployment Annotations.datasource.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "uid": "VrkJ-7W7z", - "type": "grafana-simple-json-datasource", - "typeLogoUrl": "", - "access": "proxy", - "url": "https://dotneteng-status.azurewebsites.net/api/annotations", - "password": "", - "user": "", - "database": 
"", - "basicAuth": false, - "basicAuthUser": "abcd", - "basicAuthPassword": "", - "withCredentials": false, - "isDefault": false, - "jsonData": { - "httpHeaderName1": "Authorization", - "tlsAuth": false, - "tlsSkipVerify": false - }, - "readOnly": false, - "secureJsonData": { - "basicAuthPassword": "", - "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" - } -} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json new file mode 100644 index 000000000..37e92e155 --- /dev/null +++ b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations (Infinity).datasource.json @@ -0,0 +1,16 @@ +{ + "uid": "deployment-annotations-infinity", + "name": "Deployment Annotations (Infinity)", + "type": "yesoreyeram-infinity-datasource", + "access": "proxy", + "url": "https://dotneteng-status-staging.azurewebsites.net", + "jsonData": { + "tlsSkipVerify": false, + "httpHeaderName1": "Authorization" + }, + "secureJsonData": { + "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" + }, + "isDefault": false, + "readOnly": false +} diff --git a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json b/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json deleted file mode 100644 index acfad94b2..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/datasource/Staging/Deployment Annotations.datasource.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "uid": "VrkJ-7W7z", - "type": "grafana-simple-json-datasource", - "typeLogoUrl": "", - "access": "proxy", - "url": "https://dotneteng-status-staging.azurewebsites.net/api/annotations", - "password": "", - "user": "", - "database": "", - "basicAuth": false, - "basicAuthUser": "abcd", - "basicAuthPassword": "", - 
"withCredentials": false, - "isDefault": false, - "jsonData": { - "httpHeaderName1": "Authorization", - "tlsAuth": false, - "tlsSkipVerify": false - }, - "readOnly": false, - "secureJsonData": { - "basicAuthPassword": "", - "httpHeaderValue1": "[vault(dotneteng-status-auth-header)]" - } -} diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json index 8c6d7d3e6..cd4031247 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.12h_reminder.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image, 12h reminder)", "type": "webhook", - "isDefault": false, - "sendReminder": true, "disableResolveMessage": false, - "frequency": "12h", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json deleted file mode 100644 index a337a1271..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.imageless.notification.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": ".NET Status Alert (no image)", - "type": "webhook", - "isDefault": false, - "sendReminder": false, - "disableResolveMessage": false, - "frequency": "", - 
"settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, - "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "username": "ignored" - } -} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json index 34e223ac2..5d1c2d143 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/statusHook.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json index f65218b6c..b8f49b778 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Production/teamsHook.notification.json @@ -1,14 +1,10 @@ { "name": "Teams Alert", "type": "teams", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "uploadImage": false, - "url": 
"[vault(fr-bot-notifications-teams-notification-url)]" + "url": "[vault(fr-bot-notifications-teams-notification-url)]", + "message": "{{ template \"teams.default.message\" . }}", + "title": "{{ template \"teams.default.title\" . }}" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json index 09aeb8713..875a5780c 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.12h_reminder.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert (no image, 12h reminder)", "type": "webhook", - "isDefault": false, - "sendReminder": true, "disableResolveMessage": false, - "frequency": "12h", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json deleted file mode 100644 index e677ea1ab..000000000 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.imageless.notification.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": ".NET Status Alert (no image)", - "type": "webhook", - "isDefault": false, - "sendReminder": false, - "disableResolveMessage": false, - "frequency": "", - "settings": { - "autoResolve": true, - 
"httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, - "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" - } -} \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json index 61b1b74ce..895db63b7 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/statusHook.notification.json @@ -1,16 +1,11 @@ { "name": ".NET Status Alert", "type": "webhook", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]", - "uploadImage": false, "url": "https://dotneteng-status-staging.azurewebsites.net/api/alert", - "username": "ignored" + "httpMethod": "POST", + "username": "ignored", + "password": "[vault(dotnet-build-bot-dotnet-eng-status-token)]" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json index f65218b6c..b8f49b778 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json +++ b/src/Monitoring/Monitoring.ArcadeServices/notifications/Staging/teamsHook.notification.json @@ -1,14 +1,10 @@ { "name": "Teams Alert", "type": "teams", - "isDefault": false, - "sendReminder": false, "disableResolveMessage": false, - "frequency": "", "settings": { - "autoResolve": true, - "httpMethod": "POST", - "uploadImage": false, - "url": "[vault(fr-bot-notifications-teams-notification-url)]" + "url": 
"[vault(fr-bot-notifications-teams-notification-url)]", + "message": "{{ template \"teams.default.message\" . }}", + "title": "{{ template \"teams.default.title\" . }}" } } \ No newline at end of file diff --git a/src/Monitoring/Monitoring.ArcadeServices/parameters.json b/src/Monitoring/Monitoring.ArcadeServices/parameters.json index 01cc7c57f..7e8541b4b 100644 --- a/src/Monitoring/Monitoring.ArcadeServices/parameters.json +++ b/src/Monitoring/Monitoring.ArcadeServices/parameters.json @@ -161,7 +161,7 @@ } }, { - "Name" : "product-construction-service-workspace-resourcename", + "Name": "product-construction-service-workspace-resourcename", "Values": { "Staging": "product-construction-service-workspace-int", "Production": "product-construction-service-workspace-prod" @@ -173,5 +173,26 @@ "Staging": "e6b5f9f5-0ca4-4351-879b-014d78400ec2", "Production": "fbd6122a-9ad3-42e4-976e-bccb82486856" } + }, + { + "Name": "home-dashboard-uid", + "Values": { + "Staging": "home", + "Production": "home" + } + }, + { + "Name": "arcade-services-folderid", + "Values": { + "Staging": "46", + "Production": "37" + } + }, + { + "Name": "helix-service-folderid", + "Values": { + "Staging": "92", + "Production": "41" + } } ] \ No newline at end of file diff --git a/src/Monitoring/Sdk/DeployPublisher.cs b/src/Monitoring/Sdk/DeployPublisher.cs index 854da03f2..e2d8b31b2 100644 --- a/src/Monitoring/Sdk/DeployPublisher.cs +++ b/src/Monitoring/Sdk/DeployPublisher.cs @@ -52,6 +52,22 @@ public DeployPublisher( private string EnvironmentDatasourceDirectory => Path.Combine(DatasourceDirectory, _environment); private string EnvironmentNotificationDirectory => Path.Combine(NotificationDirectory, _environment); + private string AlertRuleDirectory + { + get + { + string baseDir = Path.Combine(Path.GetDirectoryName(NotificationDirectory), "alertrules"); + string environmentSpecificDir = Path.Combine(baseDir, _environment); + + // If environment-specific folder exists, use it; otherwise fall back to base 
directory + if (Directory.Exists(environmentSpecificDir)) + { + return environmentSpecificDir; + } + + return baseDir; + } + } public void Dispose() { @@ -62,9 +78,13 @@ public async Task PostToGrafanaAsync() { await PostDatasourcesAsync().ConfigureAwait(false); - await PostNotificationsAsync().ConfigureAwait(false); + await PostContactPointsAsync().ConfigureAwait(false); + + await PostAlertRulesAsync().ConfigureAwait(false); await PostDashboardsAsync().ConfigureAwait(false); + + await SetHomeDashboardAsync().ConfigureAwait(false); } private async Task PostDatasourcesAsync() @@ -115,6 +135,91 @@ private async Task PostNotificationsAsync() } } + private async Task PostContactPointsAsync() + { + // Check if notification directory exists (optional feature) + if (!Directory.Exists(EnvironmentNotificationDirectory)) + { + Log.LogMessage(MessageImportance.Low, "No notification directory found at {0}, skipping contact points", EnvironmentNotificationDirectory); + return; + } + + foreach (string notificationPath in Directory.GetFiles(EnvironmentNotificationDirectory, + "*" + NotificationExtension, + SearchOption.AllDirectories)) + { + JObject data; + using (var sr = new StreamReader(notificationPath)) + using (var jr = new JsonTextReader(sr)) + { + data = await JObject.LoadAsync(jr).ConfigureAwait(false); + } + + string name = data.Value("name"); + Log.LogMessage(MessageImportance.Normal, "Posting contact point {0}...", name); + + await ReplaceVaultAsync(data); + + await GrafanaClient.CreateContactPointAsync(data).ConfigureAwait(false); + } + } + + private async Task PostAlertRulesAsync() + { + // Check if alert rules directory exists + if (!Directory.Exists(AlertRuleDirectory)) + { + Log.LogMessage(MessageImportance.Low, "No alert rules directory found at {0}, skipping alert rules", AlertRuleDirectory); + return; + } + + Log.LogMessage(MessageImportance.High, "Loading parameters from: {0}", Path.GetFullPath(_parameterFile)); + Log.LogMessage(MessageImportance.High, 
"Parameters file exists: {0}", File.Exists(_parameterFile)); + + // Load parameters for deparameterization + List parameters; + using (StreamReader sr = new StreamReader(_parameterFile)) + using (JsonReader jr = new JsonTextReader(sr)) + { + JsonSerializer jsonSerializer = new JsonSerializer(); + parameters = jsonSerializer.Deserialize>(jr); + } + + if (parameters == null || parameters.Count == 0) + { + Log.LogError("Failed to load parameters from {0}", _parameterFile); + return; + } + + Log.LogMessage(MessageImportance.High, "Loaded {0} parameters from {1}", parameters.Count, _parameterFile); + + foreach (string alertRulePath in Directory.GetFiles(AlertRuleDirectory, + "*" + AlertRuleExtension, + SearchOption.AllDirectories)) + { + JObject data; + using (var sr = new StreamReader(alertRulePath)) + using (var jr = new JsonTextReader(sr)) + { + data = await JObject.LoadAsync(jr).ConfigureAwait(false); + } + + string uid = data.Value("uid"); + string title = data.Value("title"); + Log.LogMessage(MessageImportance.Normal, "Posting alert rule {0} ({1})...", uid, title); + + // Replace [parameter(...)] placeholders with environment-specific values + data = GrafanaSerialization.DeparameterizeDashboard(data, parameters, _environment); + + // Log the final JSON for debugging + Log.LogMessage(MessageImportance.High, "Alert JSON after parameter replacement: {0}", data.ToString(Formatting.Indented)); + + await ReplaceVaultAsync(data); + + await GrafanaClient.CreateAlertRuleAsync(data).ConfigureAwait(false); + } + } + private async Task PostDashboardsAsync() { JArray folderArray = await GrafanaClient.ListFoldersAsync().ConfigureAwait(false); @@ -279,4 +384,40 @@ private SecretClient GetKeyVaultClient() Uri vaultUri = new($"https://{_keyVaultName}.vault.azure.net/"); return new SecretClient(vaultUri, _tokenCredential); } + + private async Task SetHomeDashboardAsync() + { + // Load parameters to get home dashboard UID + List parameters; + using (StreamReader sr = new 
StreamReader(_parameterFile)) + using (JsonReader jr = new JsonTextReader(sr)) + { + JsonSerializer jsonSerializer = new JsonSerializer(); + parameters = jsonSerializer.Deserialize>(jr); + } + + if (parameters == null) + { + Log.LogMessage(MessageImportance.Normal, "No parameters file found, skipping home dashboard configuration"); + return; + } + + // Find the home-dashboard-uid parameter + var homeDashboardParam = parameters.FirstOrDefault(p => p.Name == "home-dashboard-uid"); + if (homeDashboardParam == null || !homeDashboardParam.Values.TryGetValue(_environment, out string dashboardUid)) + { + Log.LogMessage(MessageImportance.Normal, "No home-dashboard-uid parameter found for environment {0}, skipping home dashboard configuration", _environment); + return; + } + + if (string.IsNullOrWhiteSpace(dashboardUid)) + { + Log.LogMessage(MessageImportance.Normal, "Home dashboard UID is empty, skipping home dashboard configuration"); + return; + } + + Log.LogMessage(MessageImportance.Normal, "Setting home dashboard to: {0}", dashboardUid); + await GrafanaClient.SetHomeDashboardAsync(dashboardUid).ConfigureAwait(false); + Log.LogMessage(MessageImportance.Normal, "Successfully set home dashboard"); + } } diff --git a/src/Monitoring/Sdk/DeployToolBase.cs b/src/Monitoring/Sdk/DeployToolBase.cs index fa525f9ad..7ad56be66 100644 --- a/src/Monitoring/Sdk/DeployToolBase.cs +++ b/src/Monitoring/Sdk/DeployToolBase.cs @@ -12,6 +12,7 @@ public abstract class DeployToolBase protected const string DashboardExtension = ".dashboard.json"; protected const string DatasourceExtension = ".datasource.json"; protected const string NotificationExtension = ".notification.json"; + protected const string AlertRuleExtension = ".alert.json"; protected const string BaseUidTagPrefix = "baseuid:"; protected const string SourceTagPrefix = "source:"; @@ -82,4 +83,9 @@ protected static string GetNameFromDatasourceFile(string fileName) { return fileName.Substring(0, fileName.Length - 
DatasourceExtension.Length); } + + protected static string GetUidFromAlertRuleFile(string fileName) + { + return fileName.Substring(0, fileName.Length - AlertRuleExtension.Length); + } } diff --git a/src/Monitoring/Sdk/GrafanaClient.cs b/src/Monitoring/Sdk/GrafanaClient.cs index 16103b192..74e2dcaaf 100644 --- a/src/Monitoring/Sdk/GrafanaClient.cs +++ b/src/Monitoring/Sdk/GrafanaClient.cs @@ -130,26 +130,31 @@ public async Task GetDataSourceByUidAsync(string uid) } } - public Task CreateFolderAsync(string uid, string title) + public async Task CreateFolderAsync(string uid, string title) { + // First try to get the folder - if it exists, just return it + var getUri = new Uri(new Uri(_baseUrl), $"/api/folders/{Uri.EscapeDataString(uid)}"); + using (HttpResponseMessage getResponse = await _client.GetAsync(getUri).ConfigureAwait(false)) + { + if (getResponse.IsSuccessStatusCode) + { + using (Stream stream = await getResponse.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + return await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + } + } + } + + // Folder doesn't exist, create it var folder = new JObject { {"uid", uid}, {"title", title}, }; - return CreateOrUpdateAsync( - folder, - folder.Value("uid"), - u => $"/api/folders/{Uri.EscapeDataString(u)}", - "/api/folders", - _ => (HttpMethod.Put, $"/api/folders/{uid}"), - (d, x) => - { - d.Remove("uid"); - d["version"] = x.Value("version"); - } - ); + return await SendObjectAsync(folder, new Uri(new Uri(_baseUrl), "/api/folders")).ConfigureAwait(false); } public Task CreateDatasourceAsync(JObject datasource) @@ -185,6 +190,55 @@ public Task CreateNotificationChannelAsync(JObject notificationChannel) ); } + public async Task CreateContactPointAsync(JObject contactPoint) + { + string name = contactPoint.Value("name"); + + // List all contact points to find if one with this name already exists + 
var listUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); + + using (HttpResponseMessage listResponse = await _client.GetAsync(listUri).ConfigureAwait(false)) + { + await listResponse.EnsureSuccessWithContentAsync(); + + JArray allContactPoints; + using (Stream stream = await listResponse.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + allContactPoints = await JArray.LoadAsync(jsonReader).ConfigureAwait(false); + } + + // Find existing contact point by name + JObject existing = null; + foreach (JToken item in allContactPoints) + { + if (item is JObject cp && cp.Value("name") == name) + { + existing = cp; + break; + } + } + + if (existing != null) + { + // Update existing contact point using UID + string existingUid = existing.Value("uid"); + var updateUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(existingUid)}"); + + // Preserve the existing uid + contactPoint["uid"] = existingUid; + + await SendObjectAsync(contactPoint, updateUri, HttpMethod.Put).ConfigureAwait(false); + return; + } + } + + // Create new contact point using POST only if not found + var createUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/contact-points"); + await SendObjectAsync(contactPoint, createUri, HttpMethod.Post).ConfigureAwait(false); + } + private async Task CreateOrUpdateAsync( JObject data, TExternalId id, @@ -319,6 +373,88 @@ public async Task GetNotificationChannelAsync(string uid) } } + public async Task GetContactPointAsync(string name) + { + var uri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/contact-points/{Uri.EscapeDataString(name)}"); + + using (HttpResponseMessage response = await _client.GetAsync(uri).ConfigureAwait(false)) + { + if (response.StatusCode == HttpStatusCode.NotFound) + return null; + + await response.EnsureSuccessWithContentAsync(); + + using (Stream stream 
= await response.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + return await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + } + } + } + + public async Task CreateAlertRuleAsync(JObject alertRule) + { + string uid = alertRule.Value("uid"); + + // Check if alert rule already exists + var getUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/alert-rules/{Uri.EscapeDataString(uid)}"); + + using (HttpResponseMessage existCheck = await _client.GetAsync(getUri).ConfigureAwait(false)) + { + if (existCheck.StatusCode == HttpStatusCode.NotFound) + { + // Create new alert rule + var createUri = new Uri(new Uri(_baseUrl), "/api/v1/provisioning/alert-rules"); + await SendObjectAsync(alertRule, createUri, HttpMethod.Post).ConfigureAwait(false); + } + else + { + // Update existing alert rule + await existCheck.EnsureSuccessWithContentAsync(); + + // Get existing version and provenance + using (Stream stream = await existCheck.Content.ReadAsStreamAsync().ConfigureAwait(false)) + using (var streamReader = new StreamReader(stream)) + using (var jsonReader = new JsonTextReader(streamReader)) + { + JObject existing = await JObject.LoadAsync(jsonReader).ConfigureAwait(false); + + // Preserve id and updated timestamp + if (existing.TryGetValue("id", out JToken idToken)) + alertRule["id"] = idToken; + if (existing.TryGetValue("updated", out JToken updatedToken)) + alertRule["updated"] = updatedToken; + } + + var updateUri = new Uri(new Uri(_baseUrl), $"/api/v1/provisioning/alert-rules/{Uri.EscapeDataString(uid)}"); + await SendObjectAsync(alertRule, updateUri, HttpMethod.Put).ConfigureAwait(false); + } + } + } + + public async Task SetHomeDashboardAsync(string dashboardUid) + { + // Set organization preferences (home dashboard and timezone) + var preferences = new JObject + { + {"homeDashboardUID", dashboardUid}, + {"timezone", "browser"} + }; + + var 
preferencesUri = new Uri(new Uri(_baseUrl), "/api/org/preferences"); + await SendObjectAsync(preferences, preferencesUri, HttpMethod.Put).ConfigureAwait(false); + + // Set organization name + var orgDetails = new JObject + { + {"name", ".NET Engineering Services"} + }; + + var orgUri = new Uri(new Uri(_baseUrl), "/api/org"); + await SendObjectAsync(orgDetails, orgUri, HttpMethod.Put).ConfigureAwait(false); + } + public void Dispose() { _client?.Dispose(); diff --git a/src/Monitoring/Sdk/GrafanaSerialization.cs b/src/Monitoring/Sdk/GrafanaSerialization.cs index d74c992e6..1f095c61d 100644 --- a/src/Monitoring/Sdk/GrafanaSerialization.cs +++ b/src/Monitoring/Sdk/GrafanaSerialization.cs @@ -211,7 +211,17 @@ public static JObject DeparameterizeDashboard(JObject dashboard, IEnumerable ExecuteAsync() } catch (HttpRequestException e) { - Log.LogErrorFromException(e, showStackTrace: false, showDetail: false, file: "MonitoringPublish"); + Log.LogErrorFromException(e, showStackTrace: true, showDetail: true, file: "MonitoringPublish"); + return false; + } + catch (System.Exception e) + { + Log.LogErrorFromException(e, showStackTrace: true, showDetail: true, file: "MonitoringPublish"); return false; } }