diff --git a/root/values.yaml b/root/values.yaml index 9cdf78ba..f3bf77e6 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -168,10 +168,12 @@ apps: valuesFile: ../values_cf.yaml syncWave: -3 otel-lgtm-stack: - path: otel-lgtm-stack + path: otel-lgtm-stack/v1.0.7 namespace: otel-lgtm-stack - directory: - recurse: true + valuesFile: ../values_cf.yaml + helmParameters: + - name: cluster.name + value: "{{ .Values.global.domain }}" syncWave: -2 # Databases cnpg-operator: diff --git a/sbom/components.yaml b/sbom/components.yaml index 236cc828..0de74ce3 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -79,7 +79,8 @@ components: license: Apache License 2.0 licenseUrl: https://github.com/open-telemetry/opentelemetry-operator/blob/main/LICENSE otel-lgtm-stack: - path: otel-lgtm-stack + path: otel-lgtm-stack/v1.0.7 + valuesFile: ../values_cf.yaml sourceUrl: https://github.com/silogen/docker-otel-lgtm projectUrl: https://github.com/grafana/docker-otel-lgtm license: Apache License 2.0 @@ -187,10 +188,16 @@ components: projectUrl: https://github.com/minio/operator license: GNU Affero General Public License v3.0 licenseUrl: https://github.com/minio/operator/blob/master/LICENSE + aim-cluster-model-source: # This will be part of kaiwo-crds in near future + path: aim-cluster-model-source + sourceUrl: https://github.com/silogen/kaiwo/releases/download/v0.2.0-rc11/crds.yaml + projectUrl: https://github.com/silogen/kaiwo/ + license: MIT License + licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE kaiwo-crds: path: kaiwo-crds/v0.2.0-rc11 sourceUrl: https://github.com/silogen/kaiwo/releases/download/v0.2.0-rc11/crds.yaml - projectUrl: //github.com/silogen/kaiwo/ + projectUrl: https://github.com/silogen/kaiwo/ license: MIT License licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE kaiwo: diff --git a/sources/otel-lgtm-stack/README.md b/sources/otel-lgtm-stack/README.md index 9c23301e..73c955f6 100644 --- a/sources/otel-lgtm-stack/README.md +++ b/sources/otel-lgtm-stack/README.md @@ -1,56 +1,86 @@ -# Required tools +# OpenTelemetry LGTM Stack + +A comprehensive observability stack providing Logs, Grafana, Tempo, and Mimir (LGTM) for Kubernetes clusters using OpenTelemetry. + +## Required Tools + - cert-manager - opentelemetry-operator - prometheus-crds - node-exporter - kube-state-metrics -# This tool consists of -- otel-collectors for metrics and logs -- otel-lgtm +## This Tool Consists Of + +- **OpenTelemetry Collectors** - Metrics, logs, and events collection +- **LGTM Stack** - Integrated Loki, Grafana, Tempo, Mimir observability platform +- **Auto-instrumentation** - Support for .NET, Go, Java, Node.js, Python applications +- **Kubernetes Monitoring** - Node and cluster-level metrics collection -# How this otel-collector manifests created -There are two otel-collectors for lgtm-stack, otel-collector-metrics and otel-collector-logs +## How This OpenTelemetry Collector Manifests Created +### Metrics Collection +Node-level and cluster-level metrics are collected by dedicated collectors: +- **Node Exporter** and **Kube State Metrics** expose metrics endpoints +- **otel-collector-metrics** pods (deployment) scrape configured endpoints +- Control collection via `scrape_configs` and pod annotations: + - `prometheus.io/scrape: 'true'` + - `prometheus.io/path: '/metrics'` -Node level and cluster level metrics are collected and exposed by "node-exporter" -and "kube-state-metrics", respectively. 
Otel-collector-metrics pod is a dedicated collector -controlled by deployment to scrape metrics. So "scrape_configs" and -giving annotation like "prometheus.io/scrape: 'true'", prometheus.io/path: '/metrics' to pods -is the key to control what metrics should be scraped. -ref: https://grafana.com/docs/grafana-cloud/monitor-infrastructure/kubernetes-monitoring/configuration/helm-chart-config/otel-collector/ +Reference: [Grafana Kubernetes Monitoring Guide](https://grafana.com/docs/grafana-cloud/monitor-infrastructure/kubernetes-monitoring/configuration/helm-chart-config/otel-collector/) -Node level and cluster level logs are collected by "otel-collector-logs" pods which are -controlled by daemonset to collect logs. +### Logs Collection +- **otel-collector-logs** pods (daemonset) collect container logs cluster-wide +- **otel-collector-logs-events** (deployment) collects Kubernetes events +- Based on modified [openobserve-collector](https://github.com/openobserve/openobserve-collector) manifests -This otel-collector-logs manifest is created from the modification of openobserve-collector manifests. +### Auto-Instrumentation +Pre-configured instrumentation resources for automatic telemetry injection. Applications can enable auto-instrumentation by adding pod annotations. -Current instrumentations are configured to send telemetries to the endpoint of otel-lgtm. -Users/Developers who want to use auto instrumation need to implement by giving an annotation to their pods. +## Source of OTEL-LGTM Stack -# Source of otel-lgtm stack -- https://github.com/grafana/docker-otel-lgtm/tree/main +- **Silogen Fork**: [silogen/docker-otel-lgtm](https://github.com/silogen/docker-otel-lgtm) +- **Upstream**: [grafana/docker-otel-lgtm](https://github.com/grafana/docker-otel-lgtm/tree/main) +- **Version**: v1.0.7 +- **Image**: `ghcr.io/silogen/docker-otel-lgtm:v1.0.7` -# How to access the grafana of lgtm +## How to Access Grafana + +```bash kubectl port-forward -n otel-lgtm-stack service/lgtm-stack 3000:3000 4317:4317 4318:4318 +``` -id/password of grafana: admin/admin +**Default Credentials**: admin/admin -# Simple use case(Log) -1. Login -2. Go to explore -3. Select "Loki" datasource -4. Use label filters +**Access URLs**: +- Grafana: http://localhost:3000 +- OTLP gRPC: http://localhost:4317 +- OTLP HTTP: http://localhost:4318 -# Disclaimer -[docker-otel-lgtm](https://github.com/grafana/docker-otel-lgtm/tree/main/docker) is added Cluster-Forge for development, demo, and testing. The only changed part is as follows at "otelcol-config.yaml" to make a custom image. We don't manage/develop this. -``` +### Simple Log Exploration +1. Login to Grafana +2. Navigate to **Explore** +3. Select **Loki** datasource +4. Use label filters to query logs + +## Architecture & Management + +**LGTM Stack Image**: We maintain a [silogen/docker-otel-lgtm](https://github.com/silogen/docker-otel-lgtm) fork specifically for managing and updating the LGTM container image. This fork is based on the upstream [grafana/docker-otel-lgtm](https://github.com/grafana/docker-otel-lgtm) project. + +**Kubernetes Resources**: All Kubernetes manifests (OpenTelemetry Collectors, RBAC, instrumentation, etc.) are managed through our custom Helm chart stored in Cluster-Forge, **not** from the upstream docker-otel-lgtm repository. 
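The pod-annotation mechanisms described above (Prometheus-style scraping by `otel-collector-metrics` and operator-driven auto-instrumentation) can be combined on a single workload. A minimal sketch, assuming the `lgtm-python` Instrumentation shipped by this chart and the standard OpenTelemetry Operator inject annotation; the deployment name, namespace, and image are hypothetical:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: demo-api          # hypothetical workload
  namespace: demo         # hypothetical namespace
spec:
  replicas: 1
  selector:
    matchLabels:
      app: demo-api
  template:
    metadata:
      labels:
        app: demo-api
      annotations:
        # Opt this pod in to scraping by otel-collector-metrics
        prometheus.io/scrape: "true"
        prometheus.io/path: "/metrics"
        # Ask the OpenTelemetry Operator to inject Python auto-instrumentation,
        # referencing the lgtm-python Instrumentation in the otel-lgtm-stack namespace
        instrumentation.opentelemetry.io/inject-python: "otel-lgtm-stack/lgtm-python"
    spec:
      containers:
        - name: app
          image: demo-api:latest   # hypothetical image
          ports:
            - containerPort: 8080
```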
+ +**Custom LGTM Image Modifications**: Our fork includes the following changes in `otelcol-config.yaml`: + +```yaml receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 - max_recv_msg_size_mib: 128 <-- added + max_recv_msg_size_mib: 128 # <-- added ``` - +**Scope of Management**: +- 🔧 **Image Management**: Silogen maintains the docker-otel-lgtm fork for LGTM container updates +- 📦 **Resource Management**: All OpenTelemetry collectors, RBAC, and Kubernetes resources are managed via this Cluster-Forge Helm chart +- 🎯 **Integration**: The chart integrates the custom LGTM image with our OpenTelemetry collector architecture diff --git a/sources/otel-lgtm-stack/dashboards.yaml b/sources/otel-lgtm-stack/dashboards.yaml new file mode 100644 index 00000000..951fccf5 --- /dev/null +++ b/sources/otel-lgtm-stack/dashboards.yaml @@ -0,0 +1,5 @@ +# This file contains dashboard ConfigMaps with Prometheus template expressions +# It's kept outside templates/ to avoid Helm templating conflicts +# ArgoCD can deploy this file directly alongside the Helm chart + +# This approach will be used if the raw block method doesn't work with ArgoCD \ No newline at end of file diff --git a/sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-values.yaml b/sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-values.yaml deleted file mode 100644 index ac395ac7..00000000 --- a/sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-values.yaml +++ /dev/null @@ -1,202 +0,0 @@ -# Default values for kube-state-metrics. -prometheusScrape: true - -replicas: 1 - -# Change the deployment strategy when autosharding is disabled. -# ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy -# The default is "RollingUpdate" as per Kubernetes defaults. -# During a release, 'RollingUpdate' can lead to two running instances for a short period of time while 'Recreate' can create a small gap in data. -# updateStrategy: Recreate - -# Number of old history to retain to allow rollback -# Default Kubernetes value is set to 10 -revisionHistoryLimit: 10 - -# List of additional cli arguments to configure kube-state-metrics -# for example: --enable-gzip-encoding, --log-file, etc. -# all the possible args can be found here: https://github.com/kubernetes/kube-state-metrics/blob/master/docs/cli-arguments.md -extraArgs: [] - -# If false then the user will opt out of automounting API credentials. -automountServiceAccountToken: true - -service: - port: 8080 - # Default to clusterIP for backward compatibility - type: ClusterIP - ipDualStack: - enabled: false - ipFamilies: ["IPv6", "IPv4"] - ipFamilyPolicy: "PreferDualStack" - nodePort: 0 - loadBalancerIP: "" - # Only allow access to the loadBalancerIP from these IPs - loadBalancerSourceRanges: [] - clusterIP: "" - annotations: {} - -## Additional labels to add to all resources -customLabels: {} - # app: kube-state-metrics - -## Override selector labels -selectorOverride: {} - -## set to true to add the release label so scraping of the servicemonitor with kube-prometheus-stack works out of the box -releaseLabel: false - -hostNetwork: false - -rbac: - # If true, create & use RBAC resources - create: true - - # Set to a rolename to use existing role - skipping role creating - but still doing serviceaccount and rolebinding to it, rolename set here. 
- # useExistingRole: your-existing-role - - # If set to false - Run without Cluteradmin privs needed - ONLY works if namespace is also set (if useExistingRole is set this name is used as ClusterRole or Role to bind to) - useClusterRole: true - - # Add permissions for CustomResources' apiGroups in Role/ClusterRole. Should be used in conjunction with Custom Resource State Metrics configuration - # Example: - # - apiGroups: ["monitoring.coreos.com"] - # resources: ["prometheuses"] - # verbs: ["list", "watch"] - extraRules: [] - - -serviceAccount: - # Specifies whether a ServiceAccount should be created, require rbac true - create: true - # The name of the ServiceAccount to use. - # If not set and create is true, a name is generated using the fullname template - name: - # Reference to one or more secrets to be used when pulling images - # ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ - imagePullSecrets: [] - # ServiceAccount annotations. - # Use case: AWS EKS IAM roles for service accounts - # ref: https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html - annotations: {} - # If false then the user will opt out of automounting API credentials. - automountServiceAccountToken: true - -# Additional Environment variables -env: {} - # - name: GOMAXPROCS - # valueFrom: - # resourceFieldRef: - # resource: limits.cpu - - -## Specify if a Pod Security Policy for kube-state-metrics must be created -## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ -## -podSecurityPolicy: - enabled: false - annotations: {} - ## Specify pod annotations - ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor - ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp - ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl - ## - # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' - # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' - # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' - - additionalVolumes: [] - -## Configure network policy for kube-state-metrics -networkPolicy: - enabled: false - # networkPolicy.flavor -- Flavor of the network policy to use. - # Can be: - # * kubernetes for networking.k8s.io/v1/NetworkPolicy - # * cilium for cilium.io/v2/CiliumNetworkPolicy - flavor: kubernetes - - ## Configure the cilium network policy kube-apiserver selector - # cilium: - # kubeApiServerSelector: - # - toEntities: - # - kube-apiserver - - # egress: - # - {} - # ingress: - # - {} - # podSelector: - # matchLabels: - # app.kubernetes.io/name: kube-state-metrics - -securityContext: - enabled: true - runAsGroup: 65534 - runAsUser: 65534 - fsGroup: 65534 - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - -## Specify security settings for a Container -## Allows overrides and additional options compared to (Pod) securityContext -## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container -containerSecurityContext: - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - -# Comma-separated list of additional Kubernetes label keys that will be used in the resource's -# labels metric. By default the metric contains only name and namespace labels. 
-# To include additional labels, provide a list of resource names in their plural form and Kubernetes -# label keys you would like to allow for them (Example: '=namespaces=[k8s-label-1,k8s-label-n,...],pods=[app],...)'. -# A single '*' can be provided per resource instead to allow any labels, but that has -# severe performance implications (Example: '=pods=[*]'). -metricLabelsAllowlist: [] - # - namespaces=[k8s-label-1,k8s-label-n] - -# Comma-separated list of Kubernetes annotations keys that will be used in the resource' -# labels metric. By default the metric contains only name and namespace labels. -# To include additional annotations provide a list of resource names in their plural form and Kubernetes -# annotation keys you would like to allow for them (Example: '=namespaces=[kubernetes.io/team,...],pods=[kubernetes.io/team],...)'. -# A single '*' can be provided per resource instead to allow any annotations, but that has -# severe performance implications (Example: '=pods=[*]'). -metricAnnotationsAllowList: [] - # - pods=[k8s-annotation-1,k8s-annotation-n] - -# Available collectors for kube-state-metrics. -# By default, all available resources are enabled, comment out to disable. -collectors: - - certificatesigningrequests - - configmaps - - cronjobs - - daemonsets - - deployments - - endpoints - - horizontalpodautoscalers - - ingresses - - jobs - - leases - - limitranges - - mutatingwebhookconfigurations - - namespaces - - networkpolicies - - nodes - - persistentvolumeclaims - - persistentvolumes - - poddisruptionbudgets - - pods - - replicasets - - replicationcontrollers - - resourcequotas - - secrets - - services - - statefulsets - - storageclasses - - validatingwebhookconfigurations - - volumeattachments - - diff --git a/sources/otel-lgtm-stack/v1.0.7/Chart.yaml b/sources/otel-lgtm-stack/v1.0.7/Chart.yaml new file mode 100644 index 00000000..390773b6 --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +name: otel-lgtm-stack +description: A Helm chart for OpenTelemetry LGTM (Loki, Grafana, Tempo, Mimir) stack + +# A chart can be either an 'application' or a 'library' chart. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 1.0.7 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.7" \ No newline at end of file diff --git a/sources/otel-lgtm-stack/node-exporter/chrony-node-exporter.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/chrony-node-exporter.yaml similarity index 92% rename from sources/otel-lgtm-stack/node-exporter/chrony-node-exporter.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/chrony-node-exporter.yaml index 8c41f314..14057da7 100644 --- a/sources/otel-lgtm-stack/node-exporter/chrony-node-exporter.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/chrony-node-exporter.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: nodeexporter-chrony-exporter - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: app: chrony-exporter spec: @@ -34,10 +34,10 @@ spec: name: chronyd-socket resources: limits: - cpu: 100m + cpu: "100m" memory: 128Mi requests: - cpu: 50m + cpu: "50m" memory: 64Mi volumes: - name: chronyd-socket diff --git a/sources/otel-lgtm-stack/otel-collectors/collector-manifests.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-logs-metrics-k8s.yaml similarity index 64% rename from sources/otel-lgtm-stack/otel-collectors/collector-manifests.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/collectors-logs-metrics-k8s.yaml index 5ff24192..0a23f2c6 100644 --- a/sources/otel-lgtm-stack/otel-collectors/collector-manifests.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-logs-metrics-k8s.yaml @@ -1,217 +1,8 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: otel-collector - namespace: otel-lgtm-stack ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: otel-collector -rules: - - apiGroups: [""] - resources: - - configmaps - - endpoints - - events - - namespaces - - namespaces/status - - nodes - - nodes/spec - - nodes/stats - - nodes/metrics - - nodes/proxy - - persistentvolumes - - persistentvolumeclaims - - pods - - pods/status - - replicationcontrollers - - replicationcontrollers/status - - resourcequotas - - services - verbs: ["get", "list", "watch"] - - apiGroups: ["extensions"] - resources: - - ingresses - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: - - configmaps - verbs: ["get"] - - apiGroups: [""] - resources: - - namespaces - verbs: ["get", "list", "watch"] - # Added permission for non-resource URLs to access metrics endpoints - - nonResourceURLs: - - "/metrics" - - "/metrics/cadvisor" - - "/stats/summary" - - "/api/v1/nodes/*/proxy/metrics" - - "/api/v1/nodes/*/proxy/metrics/cadvisor" - verbs: ["get"] - # Added networking.k8s.io API group for newer Kubernetes versions - - apiGroups: ["networking.k8s.io"] - resources: - - ingresses - verbs: ["get", "list", "watch"] - # Added permission to access custom resource definitions if using any - - apiGroups: ["apiextensions.k8s.io"] - resources: - - customresourcedefinitions - verbs: ["get", "list", "watch"] - # Added events.k8s.io API group for newer Kubernetes events - - apiGroups: ["events.k8s.io"] - resources: - - events - verbs: ["get", "list", "watch"] - - apiGroups: ["monitoring.coreos.com"] - resources: - - servicemonitors - - podmonitors - - probes - - scrapeconfigs - verbs: ["*"] - - apiGroups: ["apps"] - resources: - - daemonsets - - deployments - - replicasets - - statefulset - verbs: ["get", "list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["get", "list", "watch"] - - apiGroups: ["discovery.k8s.io"] - resources: - - endpointslices - verbs: ["get", "list", "watch"] ---- 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: otel-collector -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: otel-collector -subjects: - - kind: ServiceAccount - name: otel-collector - namespace: otel-lgtm-stack ---- -# Source: openobserve-collector/templates/instrumentation-dotnet.yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: lgtm-dotnet - namespace: otel-lgtm-stack -spec: - exporter: - endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### - propagators: - - tracecontext - - baggage - sampler: - type: parentbased_traceidratio - argument: "1" - dotnet: - env: - - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL - value: http/protobuf - - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL - value: http/protobuf ---- -# Source: openobserve-collector/templates/instrumentation-go.yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: lgtm-go - namespace: otel-lgtm-stack -spec: - go: - # image: ghcr.io/openobserve/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.7.0-alpha-5 - image: ghcr.io/open-telemetry/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.19.0-alpha - exporter: - endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ## - propagators: - - tracecontext - - baggage - sampler: - type: parentbased_traceidratio - argument: "1" ---- -# Source: openobserve-collector/templates/instrumentation-java.yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: lgtm-java - namespace: otel-lgtm-stack -spec: - exporter: - endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### - propagators: - - tracecontext - - baggage - sampler: - type: parentbased_traceidratio - argument: "1" - java: - env: - - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL - value: http/protobuf - - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL - value: http/protobuf ---- -# Source: openobserve-collector/templates/instrumentation-nodejs.yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: lgtm-nodejs - namespace: otel-lgtm-stack -spec: - exporter: - endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### - propagators: - - tracecontext - - baggage - sampler: - type: parentbased_traceidratio - argument: "1" ---- -# Source: openobserve-collector/templates/instrumentation-python.yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: lgtm-python - namespace: otel-lgtm-stack -spec: - exporter: - endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### - propagators: - - tracecontext - - baggage - sampler: - type: parentbased_traceidratio - argument: "1" - python: - env: - - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL - value: http/protobuf - - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL - value: http/protobuf - - name: OTEL_LOGS_EXPORTER - value: otlp_proto_http - - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED - value: "false" # set to true to enable auto instrumentation for logs ---- apiVersion: opentelemetry.io/v1beta1 kind: OpenTelemetryCollector metadata: name: otel-collector-logs - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: mode: daemonset serviceAccount: otel-collector @@ -231,11 +22,11 @@ spec: prometheus.io/port: "8888" resources: limits: - cpu: '1' - memory: 2Gi + cpu: {{ .Values.collectors.resources.logs.limits.cpu | quote }} + memory: {{ .Values.collectors.resources.logs.limits.memory }} requests: - cpu: 200m - memory: 400Mi + cpu: {{ 
.Values.collectors.resources.logs.requests.cpu | quote }} + memory: {{ .Values.collectors.resources.logs.requests.memory }} securityContext: #### privileged: true runAsUser: 0 #### @@ -315,7 +106,7 @@ spec: actions: - key: k8s_cluster_name action: insert - value: "cluster-name" ### value should be updated based on cluster + value: {{ .Values.cluster.name | quote }} transform: ### this adds k8s.cluster.name as a label filter error_mode: ignore log_statements: @@ -444,7 +235,7 @@ apiVersion: opentelemetry.io/v1beta1 kind: OpenTelemetryCollector metadata: name: otel-collector-logs-events - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 mode: deployment @@ -558,7 +349,7 @@ apiVersion: opentelemetry.io/v1beta1 kind: OpenTelemetryCollector metadata: name: otel-collector-metrics-k8s - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: mode: deployment serviceAccount: otel-collector @@ -569,11 +360,11 @@ spec: prometheus.io/port: "8888" resources: limits: - cpu: '2' - memory: 8Gi + cpu: {{ .Values.collectors.resources.metrics.limits.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.limits.memory }} requests: - cpu: 500m - memory: 1Gi + cpu: {{ .Values.collectors.resources.metrics.requests.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.requests.memory }} config: receivers: prometheus: @@ -888,7 +679,7 @@ spec: actions: - key: k8s_cluster_name action: insert - value: "cluster-name" ### value should be updated based on cluster + value: {{ .Values.cluster.name | quote }} exporters: # Configure your actual backend exporters here @@ -904,285 +695,4 @@ spec: metrics: receivers: [prometheus] processors: [memory_limiter, batch, attributes] - exporters: [otlp] ---- -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector-metrics-rest - namespace: otel-lgtm-stack -spec: - image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 - mode: deployment - podAnnotations: - prometheus.io/port: "8888" - prometheus.io/scrape: "true" - replicas: 1 - resources: - limits: - cpu: "2" - memory: 8Gi - requests: - cpu: 500m - memory: 1Gi - serviceAccount: otel-collector - config: - exporters: - debug: - verbosity: detailed - otlp: - endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 - tls: - insecure: true - processors: - attributes: - actions: - - action: insert - key: k8s_cluster_name - value: cluster-name - batch: - send_batch_size: 2000 - timeout: 10s - memory_limiter: - check_interval: 5s - limit_percentage: 80 - spike_limit_percentage: 25 - receivers: - prometheus: - config: - scrape_configs: - - job_name: otel-collector - scrape_interval: 30s - static_configs: - - targets: - - localhost:8888 - - dns_sd_configs: - - names: - - opencost-prometheus-opencost-exporter.monitoring - port: 9003 - type: A - honor_labels: true - job_name: opencost - metrics_path: /metrics - scheme: http - scrape_interval: 1m - scrape_timeout: 10s - - job_name: gpu-operator-metrics-exporter - kubernetes_sd_configs: - - role: node - metrics_path: /metrics - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_node_label_feature_node_kubernetes_io_amd_gpu - - regex: (.+) - replacement: $1:32500 - source_labels: - - __meta_kubernetes_node_address_InternalIP - target_label: __address__ - - source_labels: - - 
__meta_kubernetes_node_name - target_label: hostname - - job_name: minio-cluster-metrics - metrics_path: /minio/v2/metrics/cluster - scheme: http - static_configs: - - targets: - - minio.minio-tenant-default.svc.cluster.local - - job_name: minio-bucket-metrics - metrics_path: /minio/v2/metrics/bucket - scheme: http - static_configs: - - targets: - - minio.minio-tenant-default.svc.cluster.local - - job_name: minio-resource-metrics - metrics_path: /minio/v2/metrics/resource - scheme: http - static_configs: - - targets: - - minio.minio-tenant-default.svc.cluster.local - - job_name: argocd-controller - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - argocd-metrics.argocd.svc.cluster.local:8082 - - job_name: argocd-applicationset - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - argocd-applicationset-controller.argocd.svc.cluster.local:8080 - - job_name: argocd-repo-server - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - argocd-repo-server.argocd.svc.cluster.local:8084 - - job_name: longhorn - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - longhorn-backend.longhorn.svc.cluster.local:9500 - service: - pipelines: - metrics: - exporters: - - otlp - processors: - - memory_limiter - - batch - - attributes - receivers: - - prometheus - ---- -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector-metrics-chrony - namespace: otel-lgtm-stack -spec: - image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 - mode: deployment - podAnnotations: - prometheus.io/port: "8888" - prometheus.io/scrape: "true" - replicas: 1 - resources: - limits: - cpu: "2" - memory: 8Gi - requests: - cpu: 500m - memory: 1Gi - serviceAccount: otel-collector - config: - exporters: - debug: - verbosity: detailed - otlp: - endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 - tls: - insecure: true - processors: - attributes: - actions: - - action: insert - key: k8s_cluster_name - value: cluster-name - batch: - send_batch_size: 2000 - timeout: 10s - memory_limiter: - check_interval: 5s - limit_percentage: 80 - spike_limit_percentage: 25 - receivers: - prometheus: - config: - scrape_configs: - - job_name: chrony-exporter - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_label_app] - regex: chrony-exporter - action: keep - - source_labels: [__meta_kubernetes_pod_ip] - target_label: __address__ - regex: (.*) - replacement: $1:9123 - - source_labels: [__meta_kubernetes_pod_node_name] - target_label: k8s_node_name - service: - pipelines: - metrics: - exporters: - - otlp - processors: - - memory_limiter - - batch - - attributes - receivers: - - prometheus - ---- -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector-metrics-airm - namespace: otel-lgtm-stack -spec: - image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 - mode: deployment - podAnnotations: - prometheus.io/port: "8888" - prometheus.io/scrape: "true" - replicas: 1 - resources: - limits: - cpu: "2" - memory: 8Gi - requests: - cpu: 500m - memory: 1Gi - serviceAccount: otel-collector - config: - exporters: - debug: - verbosity: detailed - otlp: - endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 - tls: - insecure: true - processors: - attributes: - actions: - - action: insert - key: k8s_cluster_name - value: 
cluster-name - batch: - send_batch_size: 2000 - timeout: 10s - memory_limiter: - check_interval: 5s - limit_percentage: 80 - spike_limit_percentage: 25 - receivers: - prometheus: - config: - scrape_configs: - - job_name: airm-custom-metrics - scrape_interval: 30s - kubernetes_sd_configs: - - role: pod - metrics_path: / - relabel_configs: - - action: keep - source_labels: - - __meta_kubernetes_pod_label_app - regex: airm-api - - regex: (.+) - replacement: $1:9009 - source_labels: - - __meta_kubernetes_node_address_InternalIP - target_label: __address__ - action: replace - - source_labels: - - __meta_kubernetes_node_name - target_label: hostname - - service: - pipelines: - metrics: - exporters: - - otlp - processors: - - memory_limiter - - batch - - attributes - receivers: - - prometheus + exporters: [otlp] \ No newline at end of file diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/collectors-metrics-rest.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-metrics-rest.yaml new file mode 100644 index 00000000..78976467 --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-metrics-rest.yaml @@ -0,0 +1,280 @@ +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-metrics-rest + namespace: {{ .Release.Namespace }} +spec: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + mode: deployment + podAnnotations: + prometheus.io/port: "8888" + prometheus.io/scrape: "true" + replicas: 1 + resources: + limits: + cpu: {{ .Values.collectors.resources.metrics.limits.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.limits.memory }} + requests: + cpu: {{ .Values.collectors.resources.metrics.requests.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.requests.memory }} + serviceAccount: otel-collector + config: + exporters: + debug: + verbosity: detailed + otlp: + endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 + tls: + insecure: true + processors: + attributes: + actions: + - action: insert + key: k8s_cluster_name + value: {{ .Values.cluster.name | quote }} + batch: + send_batch_size: 2000 + timeout: 10s + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + receivers: + prometheus: + config: + scrape_configs: + - job_name: otel-collector + scrape_interval: 30s + static_configs: + - targets: + - localhost:8888 + - dns_sd_configs: + - names: + - opencost-prometheus-opencost-exporter.monitoring + port: 9003 + type: A + honor_labels: true + job_name: opencost + metrics_path: /metrics + scheme: http + scrape_interval: 1m + scrape_timeout: 10s + - job_name: gpu-operator-metrics-exporter + kubernetes_sd_configs: + - role: node + metrics_path: /metrics + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_node_label_feature_node_kubernetes_io_amd_gpu + - regex: (.+) + replacement: $1:32500 + source_labels: + - __meta_kubernetes_node_address_InternalIP + target_label: __address__ + - source_labels: + - __meta_kubernetes_node_name + target_label: hostname + - job_name: minio-cluster-metrics + metrics_path: /minio/v2/metrics/cluster + scheme: http + static_configs: + - targets: + - minio.minio-tenant-default.svc.cluster.local + - job_name: minio-bucket-metrics + metrics_path: /minio/v2/metrics/bucket + scheme: http + static_configs: + - targets: + - minio.minio-tenant-default.svc.cluster.local + - job_name: minio-resource-metrics + metrics_path: 
/minio/v2/metrics/resource + scheme: http + static_configs: + - targets: + - minio.minio-tenant-default.svc.cluster.local + - job_name: argocd-controller + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - argocd-metrics.argocd.svc.cluster.local:8082 + - job_name: argocd-applicationset + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - argocd-applicationset-controller.argocd.svc.cluster.local:8080 + - job_name: argocd-repo-server + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - argocd-repo-server.argocd.svc.cluster.local:8084 + - job_name: longhorn + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - longhorn-backend.longhorn.svc.cluster.local:9500 + service: + pipelines: + metrics: + exporters: + - otlp + processors: + - memory_limiter + - batch + - attributes + receivers: + - prometheus + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-metrics-chrony + namespace: {{ .Release.Namespace }} +spec: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + mode: deployment + podAnnotations: + prometheus.io/port: "8888" + prometheus.io/scrape: "true" + replicas: 1 + resources: + limits: + cpu: {{ .Values.collectors.resources.metrics.limits.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.limits.memory }} + requests: + cpu: {{ .Values.collectors.resources.metrics.requests.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.requests.memory }} + serviceAccount: otel-collector + config: + exporters: + debug: + verbosity: detailed + otlp: + endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 + tls: + insecure: true + processors: + attributes: + actions: + - action: insert + key: k8s_cluster_name + value: {{ .Values.cluster.name | quote }} + batch: + send_batch_size: 2000 + timeout: 10s + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + receivers: + prometheus: + config: + scrape_configs: + - job_name: chrony-exporter + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + regex: chrony-exporter + action: keep + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + regex: (.*) + replacement: $1:9123 + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: k8s_node_name + service: + pipelines: + metrics: + exporters: + - otlp + processors: + - memory_limiter + - batch + - attributes + receivers: + - prometheus + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-metrics-airm + namespace: {{ .Release.Namespace }} +spec: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + mode: deployment + podAnnotations: + prometheus.io/port: "8888" + prometheus.io/scrape: "true" + replicas: 1 + resources: + limits: + cpu: {{ .Values.collectors.resources.metrics.limits.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.limits.memory }} + requests: + cpu: {{ .Values.collectors.resources.metrics.requests.cpu | quote }} + memory: {{ .Values.collectors.resources.metrics.requests.memory }} + serviceAccount: otel-collector + config: + exporters: + debug: + verbosity: detailed + otlp: + endpoint: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:4317 + tls: + insecure: true + processors: + attributes: + actions: + - action: 
insert + key: k8s_cluster_name + value: {{ .Values.cluster.name | quote }} + batch: + send_batch_size: 2000 + timeout: 10s + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + receivers: + prometheus: + config: + scrape_configs: + - job_name: airm-custom-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: pod + metrics_path: / + relabel_configs: + - action: keep + source_labels: + - __meta_kubernetes_pod_label_app + regex: airm-api + - regex: (.+) + replacement: $1:9009 + source_labels: + - __meta_kubernetes_node_address_InternalIP + target_label: __address__ + action: replace + - source_labels: + - __meta_kubernetes_node_name + target_label: hostname + + service: + pipelines: + metrics: + exporters: + - otlp + processors: + - memory_limiter + - batch + - attributes + receivers: + - prometheus \ No newline at end of file diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/collectors-rbac-instrumentation.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-rbac-instrumentation.yaml new file mode 100644 index 00000000..3701e07d --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/templates/collectors-rbac-instrumentation.yaml @@ -0,0 +1,208 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + - apiGroups: [""] + resources: + - configmaps + - endpoints + - events + - namespaces + - namespaces/status + - nodes + - nodes/spec + - nodes/stats + - nodes/metrics + - nodes/proxy + - persistentvolumes + - persistentvolumeclaims + - pods + - pods/status + - replicationcontrollers + - replicationcontrollers/status + - resourcequotas + - services + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: + - configmaps + verbs: ["get"] + - apiGroups: [""] + resources: + - namespaces + verbs: ["get", "list", "watch"] + # Added permission for non-resource URLs to access metrics endpoints + - nonResourceURLs: + - "/metrics" + - "/metrics/cadvisor" + - "/stats/summary" + - "/api/v1/nodes/*/proxy/metrics" + - "/api/v1/nodes/*/proxy/metrics/cadvisor" + verbs: ["get"] + # Added networking.k8s.io API group for newer Kubernetes versions + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + # Added permission to access custom resource definitions if using any + - apiGroups: ["apiextensions.k8s.io"] + resources: + - customresourcedefinitions + verbs: ["get", "list", "watch"] + # Added events.k8s.io API group for newer Kubernetes events + - apiGroups: ["events.k8s.io"] + resources: + - events + verbs: ["get", "list", "watch"] + - apiGroups: ["monitoring.coreos.com"] + resources: + - servicemonitors + - podmonitors + - probes + - scrapeconfigs + verbs: ["*"] + - apiGroups: ["apps"] + resources: + - daemonsets + - deployments + - replicasets + - statefulset + verbs: ["get", "list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["get", "list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: 
otel-collector + namespace: {{ .Release.Namespace }} +--- +# Source: openobserve-collector/templates/instrumentation-dotnet.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: lgtm-dotnet + namespace: {{ .Release.Namespace }} +spec: + exporter: + endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "1" + dotnet: + env: + - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL + value: http/protobuf + - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL + value: http/protobuf +--- +# Source: openobserve-collector/templates/instrumentation-go.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: lgtm-go + namespace: {{ .Release.Namespace }} +spec: + go: + # image: ghcr.io/openobserve/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.7.0-alpha-5 + image: ghcr.io/open-telemetry/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.19.0-alpha + exporter: + endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ## + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "1" +--- +# Source: openobserve-collector/templates/instrumentation-java.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: lgtm-java + namespace: {{ .Release.Namespace }} +spec: + exporter: + endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "1" + java: + env: + - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL + value: http/protobuf + - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL + value: http/protobuf +--- +# Source: openobserve-collector/templates/instrumentation-nodejs.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: lgtm-nodejs + namespace: {{ .Release.Namespace }} +spec: + exporter: + endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "1" +--- +# Source: openobserve-collector/templates/instrumentation-python.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: lgtm-python + namespace: {{ .Release.Namespace }} +spec: + exporter: + endpoint: http://lgtm.lgtm-stack.svc.cluster.local:4318 ### + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "1" + python: + env: + - name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL + value: http/protobuf + - name: OTEL_EXPORTER_OTLP_METRICS_PROTOCOL + value: http/protobuf + - name: OTEL_LOGS_EXPORTER + value: otlp_proto_http + - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED + value: "false" # set to true to enable auto instrumentation for logs \ No newline at end of file diff --git a/sources/otel-lgtm-stack/dashboards/lgtm-default-dashboards.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-default.yaml similarity index 99% rename from sources/otel-lgtm-stack/dashboards/lgtm-default-dashboards.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/dashboards-default.yaml index a93ea3da..ae4d95ee 100644 --- a/sources/otel-lgtm-stack/dashboards/lgtm-default-dashboards.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-default.yaml @@ -3,7 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: lgtm-k8s-nodes-overview - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: grafana_dashboard: 
"1" annotations: @@ -2081,7 +2081,7 @@ data: "hide": false, "instant": false, "interval": "", - "legendFormat": "{{node}}", + "legendFormat": "{{ "{{" }}node{{ "}}" }}", "refId": "I" } ], @@ -2184,7 +2184,7 @@ data: "hide": false, "instant": false, "interval": "", - "legendFormat": "{{node}}", + "legendFormat": "{{ "{{" }}node{{ "}}" }}", "refId": "I" } ], @@ -2287,7 +2287,7 @@ data: "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": " update :{{node}}", + "legendFormat": " update :{{ "{{" }}node{{ "}}" }}", "metric": "network", "refId": "A", "step": 10 @@ -2302,7 +2302,7 @@ data: "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": " restart :{{node}}", + "legendFormat": " restart :{{ "{{" }}node{{ "}}" }}", "metric": "network", "refId": "B", "step": 10 @@ -2989,7 +2989,7 @@ data: "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ container}}", + "legendFormat": "{{ "{{" }} container{{ "}}" }}", "metric": "container_cpu", "refId": "A", "step": 10 @@ -3097,7 +3097,7 @@ data: "expr": "sum (container_memory_working_set_bytes{origin_prometheus=~\"$origin_prometheus\",container =~\"$Container\",container !=\"\",container!=\"POD\",namespace=~\"$NameSpace\"}) by (container)/ sum(container_spec_memory_limit_bytes{origin_prometheus=~\"$origin_prometheus\",container =~\"$Container\",container !=\"\",container!=\"POD\",namespace=~\"$NameSpace\"}) by (container) * 100", "interval": "", "intervalFactor": 1, - "legendFormat": "WSS:{{ container }}", + "legendFormat": "WSS:{{ "{{" }} container{{ "}}" }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 @@ -3110,7 +3110,7 @@ data: "expr": "sum (container_memory_rss{origin_prometheus=~\"$origin_prometheus\",container =~\"$Container\",container !=\"\",container!=\"POD\",namespace=~\"$NameSpace\"}) by (container)/ sum(container_spec_memory_limit_bytes{origin_prometheus=~\"$origin_prometheus\",container =~\"$Container\",container !=\"\",container!=\"POD\",namespace=~\"$NameSpace\"}) by (container) * 100", "interval": "", "intervalFactor": 1, - "legendFormat": "RSS:{{ container }}", + "legendFormat": "RSS:{{ "{{" }} container{{ "}}" }}", "metric": "container_memory_usage:sort_desc", "refId": "B", "step": 10 @@ -3222,7 +3222,7 @@ data: "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": " update :{{ container }}", + "legendFormat": " update :{{ "{{" }} container{{ "}}" }}", "metric": "network", "refId": "A", "step": 10 @@ -3236,7 +3236,7 @@ data: "expr": "sum(sum(irate(container_network_transmit_bytes_total{origin_prometheus=~\"$origin_prometheus\",image!=\"\",name=~\"^k8s_.*\",node=~\"^$Node$\",namespace=~\"$NameSpace\",pod=~\".*$Container.*\"}[2m])) by (pod)* on(pod) group_right kube_pod_container_info) by(container) *8", "interval": "", "intervalFactor": 1, - "legendFormat": " restart :{{ container }}", + "legendFormat": " restart :{{ "{{" }} container{{ "}}" }}", "metric": "network", "refId": "B", "step": 10 @@ -3250,7 +3250,7 @@ data: "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "-> {{ pod }}", + "legendFormat": "-> {{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "C", "step": 10 @@ -3264,7 +3264,7 @@ data: "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "<- {{ pod }}", + "legendFormat": "<- {{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "D", "step": 10 @@ -4171,7 +4171,7 @@ data: "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ pod }}", + 
"legendFormat": "{{ "{{" }} pod{{ "}}" }}", "metric": "container_cpu", "refId": "A", "step": 10 @@ -4280,7 +4280,7 @@ data: "expr": "sum (container_memory_working_set_bytes{origin_prometheus=~\"$origin_prometheus\",pod=~\"$Pod\",container =~\"$Container\",container !=\"\",container!=\"POD\",node=~\"^$Node$\",namespace=~\"$NameSpace\"}) by (container, pod)/ sum(container_spec_memory_limit_bytes{origin_prometheus=~\"$origin_prometheus\",pod=~\"$Pod\",container =~\"$Container\",container !=\"\",container!=\"POD\",node=~\"^$Node$\",namespace=~\"$NameSpace\"}) by (container, pod) * 100", "interval": "", "intervalFactor": 1, - "legendFormat": "WSS:{{ pod }}", + "legendFormat": "WSS:{{ "{{" }} pod{{ "}}" }}", "metric": "container_memory_usage:sort_desc", "refId": "A", "step": 10 @@ -4294,7 +4294,7 @@ data: "expr": "sum (container_memory_rss{origin_prometheus=~\"$origin_prometheus\",pod=~\"$Pod\",container =~\"$Container\",container !=\"\",container!=\"POD\",node=~\"^$Node$\",namespace=~\"$NameSpace\"}) by (container, pod)/ sum(container_spec_memory_limit_bytes{origin_prometheus=~\"$origin_prometheus\",pod=~\"$Pod\",container =~\"$Container\",container !=\"\",container!=\"POD\",node=~\"^$Node$\",namespace=~\"$NameSpace\"}) by (container, pod) * 100", "interval": "", "intervalFactor": 1, - "legendFormat": "RSS:{{ pod }}", + "legendFormat": "RSS:{{ "{{" }} pod{{ "}}" }}", "metric": "container_memory_usage:sort_desc", "refId": "B", "step": 10 @@ -4309,7 +4309,7 @@ data: "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "Heap:{{ pod }}", + "legendFormat": "Heap:{{ "{{" }} pod{{ "}}" }}", "metric": "container_memory_usage:sort_desc", "refId": "C", "step": 10 @@ -4422,7 +4422,7 @@ data: "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": " update :{{ pod}}", + "legendFormat": " update :{{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "A", "step": 10 @@ -4436,7 +4436,7 @@ data: "expr": "sum(sum(irate(container_network_transmit_bytes_total{origin_prometheus=~\"$origin_prometheus\",pod=~\"$Pod\",image!=\"\",name=~\"^k8s_.*\",node=~\"^$Node$\",namespace=~\"$NameSpace\",pod=~\".*$Container.*\"}[2m])) by (pod)* on(pod) group_right kube_pod_container_info) by(pod) *8", "interval": "", "intervalFactor": 1, - "legendFormat": " restart :{{ pod}}", + "legendFormat": " restart :{{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "B", "step": 10 @@ -4450,7 +4450,7 @@ data: "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "-> {{ pod }}", + "legendFormat": "-> {{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "C", "step": 10 @@ -4464,7 +4464,7 @@ data: "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "<- {{ pod }}", + "legendFormat": "<- {{ "{{" }} pod{{ "}}" }}", "metric": "network", "refId": "D", "step": 10 @@ -4724,7 +4724,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: lgtm-k8s-volume-information - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: grafana_dashboard: "1" annotations: @@ -5368,7 +5368,7 @@ data: "expr": "(\n (\n ((kubelet_volume_stats_used_bytes {namespace=~\"(openshift-.*|kube-.*|default|logging)\"} ) * 0) + 1\n and\n (predict_linear(kubelet_volume_stats_available_bytes {namespace=~\"(openshift-.*|kube-.*|default|logging)\"}[1d], 7 * 24 * 60 * 60) < 0)\n )\n)", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], 
@@ -5472,7 +5472,7 @@ data: "hide": false, "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -5707,7 +5707,7 @@ data: "expr": "(\n (\n (\n (\n (kubelet_volume_stats_used_bytes {namespace!~\"(openshift-.*|kube-.*|default|logging)\"})\n )\n * 0 + 1\n )\n and\n (predict_linear(kubelet_volume_stats_available_bytes[1d], 7 * 24 * 60 * 60) < 0)\n )\n)", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -5811,7 +5811,7 @@ data: "hide": false, "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -6547,7 +6547,7 @@ data: "expr": "(max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes ))", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -6640,7 +6640,7 @@ data: { "expr": "(max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_used_bytes )) / (max by (persistentvolumeclaim,namespace) (kubelet_volume_stats_capacity_bytes )) * 100", "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -6746,7 +6746,7 @@ data: "format": "time_series", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -6840,7 +6840,7 @@ data: "format": "time_series", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], @@ -6936,7 +6936,7 @@ data: "format": "time_series", "instant": false, "intervalFactor": 1, - "legendFormat": "{{namespace}} ({{persistentvolumeclaim}})", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})", "refId": "A" } ], diff --git a/sources/otel-lgtm-stack/dashboards/lgtm-gpu-metrics-dashboard.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-gpu.yaml similarity index 99% rename from sources/otel-lgtm-stack/dashboards/lgtm-gpu-metrics-dashboard.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/dashboards-gpu.yaml index 30afe61c..5f3d3d02 100644 --- a/sources/otel-lgtm-stack/dashboards/lgtm-gpu-metrics-dashboard.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-gpu.yaml @@ -3,7 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: lgtm-amd-gpu-dashboard - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: grafana_dashboard: "1" annotations: @@ -611,7 +611,7 @@ data: "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "{{gpu_uuid}}", + "legendFormat": "{{ "{{" }}gpu_uuid{{ "}}" }}", "range": true, "refId": "gpu_gfx_activity", "useBackend": false @@ -818,7 +818,7 @@ data: "fullMetaSearch": false, "includeNullMetadata": true, "instant": 
false, - "legendFormat": "{{gpu_uuid}}", + "legendFormat": "{{ "{{" }}gpu_uuid{{ "}}" }}", "range": true, "refId": "gpu_edge_temperature", "useBackend": false diff --git a/sources/otel-lgtm-stack/dashboards/lgtm-minio-dashboard.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-minio.yaml similarity index 98% rename from sources/otel-lgtm-stack/dashboards/lgtm-minio-dashboard.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/dashboards-minio.yaml index 9cf0509b..f2957871 100644 --- a/sources/otel-lgtm-stack/dashboards/lgtm-minio-dashboard.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/dashboards-minio.yaml @@ -3,7 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: lgtm-minio-dashboard - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: grafana_dashboard: "1" annotations: @@ -305,7 +305,7 @@ data: "editorMode": "code", "expr": "minio_cluster_bucket_total{job=\"minio-cluster-metrics\"}", "instant": false, - "legendFormat": "{{job}}", + "legendFormat": "{{ "{{" }}job{{ "}}" }}", "range": true, "refId": "A" } @@ -369,7 +369,7 @@ data: "editorMode": "code", "expr": "minio_bucket_usage_total_bytes{}", "instant": false, - "legendFormat": "{{bucket}}", + "legendFormat": "{{ "{{" }}bucket{{ "}}" }}", "range": true, "refId": "A" } @@ -492,7 +492,7 @@ data: "editorMode": "code", "expr": "increase(minio_bucket_usage_total_bytes[10m])\n", "instant": false, - "legendFormat": "{{bucket}}", + "legendFormat": "{{ "{{" }}bucket{{ "}}" }}", "range": true, "refId": "A" } diff --git a/sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-manifests.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/kube-state-metrics.yaml similarity index 96% rename from sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-manifests.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/kube-state-metrics.yaml index d481f9dc..514d416a 100644 --- a/sources/otel-lgtm-stack/kube-state-metrics/kube-state-metrics-manifests.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/kube-state-metrics.yaml @@ -13,7 +13,7 @@ metadata: app.kubernetes.io/instance: my-kube-state-metrics app.kubernetes.io/version: "2.15.0" name: my-kube-state-metrics - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} --- # Source: kube-state-metrics/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -190,14 +190,14 @@ roleRef: subjects: - kind: ServiceAccount name: my-kube-state-metrics - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} --- # Source: kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: my-kube-state-metrics - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: helm.sh/chart: kube-state-metrics-5.30.0 app.kubernetes.io/managed-by: Helm @@ -213,8 +213,8 @@ spec: ports: - name: "http" protocol: TCP - port: 8080 - targetPort: 8080 + port: {{ .Values.services.kubeStateMetrics.http }} + targetPort: {{ .Values.services.kubeStateMetrics.http }} selector: app.kubernetes.io/name: kube-state-metrics @@ -225,7 +225,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: my-kube-state-metrics - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: helm.sh/chart: kube-state-metrics-5.30.0 app.kubernetes.io/managed-by: Helm diff --git a/sources/otel-lgtm-stack/otel-lgtm/modified-manifests.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml similarity index 83% rename from sources/otel-lgtm-stack/otel-lgtm/modified-manifests.yaml rename to 
sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index f1ee5508..f42e8c26 100644 --- a/sources/otel-lgtm-stack/otel-lgtm/modified-manifests.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -4,79 +4,79 @@ kind: Namespace metadata: labels: pod-security.kubernetes.io/enforce: privileged - name: otel-lgtm-stack + name: {{ .Release.Namespace }} --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: tempo-pvc - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: storageClassName: default accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: {{ .Values.lgtm.storage.tempo }} --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: loki-data-pvc - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: storageClassName: default accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: {{ .Values.lgtm.storage.loki }} --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: loki-storage-pvc - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: storageClassName: default accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: {{ .Values.lgtm.storage.extra }} --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: grafana-pvc - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: storageClassName: default accessModes: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: {{ .Values.lgtm.storage.grafana }} --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: p8s-pvc - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: storageClassName: default accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: {{ .Values.lgtm.storage.mimir }} --- # Source: grafana/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: grafana-sidecar - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} --- # Source: grafana/templates/configmap-dashboard-provider.yaml ###### apiVersion: v1 @@ -84,7 +84,7 @@ kind: ConfigMap metadata: labels: name: grafana-config-dashboards - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} data: provider.yaml: |- apiVersion: 1 @@ -117,7 +117,7 @@ metadata: subjects: - kind: ServiceAccount name: grafana-sidecar - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole name: grafana-sidecar-clusterrole @@ -128,37 +128,37 @@ apiVersion: v1 kind: Service metadata: name: lgtm-stack - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} spec: selector: app: lgtm ports: - name: grafana protocol: TCP - port: 3000 - targetPort: 3000 + port: {{ .Values.services.lgtm.grafana }} + targetPort: {{ .Values.services.lgtm.grafana }} - name: otel-grpc protocol: TCP - port: 4317 - targetPort: 4317 + port: {{ .Values.services.lgtm.otelGrpc }} + targetPort: {{ .Values.services.lgtm.otelGrpc }} - name: otel-http protocol: TCP - port: 4318 - targetPort: 4318 + port: {{ .Values.services.lgtm.otelHttp }} + targetPort: {{ .Values.services.lgtm.otelHttp }} - name: prometheus protocol: TCP - port: 9090 - targetPort: 9090 + port: {{ .Values.services.lgtm.prometheus }} + targetPort: {{ .Values.services.lgtm.prometheus }} - name: loki protocol: TCP - port: 3100 - targetPort: 3100 + port: {{ .Values.services.lgtm.loki }} + targetPort: {{ .Values.services.lgtm.loki }} --- apiVersion: apps/v1 kind: Deployment metadata: name: lgtm - namespace: otel-lgtm-stack + namespace: {{ 
.Release.Namespace }} spec: replicas: 1 strategy: @@ -242,10 +242,10 @@ spec: failureThreshold: 3 resources: requests: - cpu: "500m" - memory: "1024Mi" + cpu: {{ .Values.lgtm.resources.requests.cpu | quote }} + memory: {{ .Values.lgtm.resources.requests.memory | quote }} limits: - memory: "8Gi" + memory: {{ .Values.lgtm.resources.limits.memory | quote }} # NOTE: By default OpenShift does not allow writing the root directory. # Thats why the data dirs for grafana, prometheus and loki can not be # created and the pod never becomes ready. diff --git a/sources/otel-lgtm-stack/node-exporter/node-exporter-manifests.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/node-exporter.yaml similarity index 95% rename from sources/otel-lgtm-stack/node-exporter/node-exporter-manifests.yaml rename to sources/otel-lgtm-stack/v1.0.7/templates/node-exporter.yaml index 5ad37339..42559d1d 100644 --- a/sources/otel-lgtm-stack/node-exporter/node-exporter-manifests.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/node-exporter.yaml @@ -4,7 +4,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: nodeexporter-prometheus-node-exporter - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: helm.sh/chart: prometheus-node-exporter-4.44.1 app.kubernetes.io/managed-by: Helm @@ -20,7 +20,7 @@ apiVersion: v1 kind: Service metadata: name: nodeexporter-prometheus-node-exporter - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: helm.sh/chart: prometheus-node-exporter-4.44.1 app.kubernetes.io/managed-by: Helm @@ -34,8 +34,8 @@ metadata: spec: type: ClusterIP ports: - - port: 9100 - targetPort: 9100 + - port: {{ .Values.services.nodeExporter.metrics }} + targetPort: {{ .Values.services.nodeExporter.metrics }} protocol: TCP name: metrics selector: @@ -47,7 +47,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: nodeexporter-prometheus-node-exporter - namespace: otel-lgtm-stack + namespace: {{ .Release.Namespace }} labels: helm.sh/chart: prometheus-node-exporter-4.44.1 app.kubernetes.io/managed-by: Helm diff --git a/sources/otel-lgtm-stack/v1.0.7/values.yaml b/sources/otel-lgtm-stack/v1.0.7/values.yaml new file mode 100644 index 00000000..6ea2fd2e --- /dev/null +++ b/sources/otel-lgtm-stack/v1.0.7/values.yaml @@ -0,0 +1,77 @@ +# OpenTelemetry LGTM Stack Configuration +# Phase 1: Essential Infrastructure Parameters + +# Cluster identification +cluster: + name: "cluster-name" # Update this to your actual cluster name + +# Namespace configuration +namespace: otel-lgtm-stack + +# LGTM Stack storage configuration +lgtm: + storage: + # Tempo storage for traces + tempo: 50Gi + # Loki storage for logs + loki: 50Gi + # Grafana storage for dashboards/config + grafana: 10Gi + # Mimir/Prometheus storage for metrics + mimir: 50Gi + # Loki additional storage + extra: 50Gi + + # LGTM stack main deployment resources + resources: + limits: + memory: 8Gi + requests: + memory: 2Gi + cpu: '1' + +# OpenTelemetry Collectors resource configuration +collectors: + resources: + # Metrics collector (deployment mode) - original values + metrics: + limits: + memory: 8Gi + cpu: '2' + requests: + memory: 1Gi + cpu: 500m + # Logs collector (daemonset mode) - original conservative values + logs: + limits: + memory: 2Gi + cpu: '1' + requests: + memory: 400Mi # Conservative default + cpu: 200m # Conservative default + +# Service configuration +services: + # Main LGTM stack service ports + lgtm: + grafana: 3000 + otelGrpc: 4317 + otelHttp: 4318 + prometheus: 9090 + loki: 3100 + # Kube state metrics service port 
+ kubeStateMetrics: + http: 8080 + # Node exporter service port + nodeExporter: + metrics: 9100 + +# Component enablement +dashboards: + enabled: true + +nodeExporter: + enabled: true + +kubeStateMetrics: + enabled: true diff --git a/sources/otel-lgtm-stack/values_cf.yaml b/sources/otel-lgtm-stack/values_cf.yaml new file mode 100644 index 00000000..7a238b4c --- /dev/null +++ b/sources/otel-lgtm-stack/values_cf.yaml @@ -0,0 +1,74 @@ +# Cluster-forge specific configuration for OpenTelemetry LGTM Stack +# This file overrides values.yaml for cluster-forge deployments + +# Cluster identification - will be populated by root/values.yaml helmParameters +cluster: + name: # to be filled by cluster-forge app based on domain + +# Component enablement (cluster-forge defaults) +dashboards: + enabled: true + +nodeExporter: + enabled: true + +kubeStateMetrics: + enabled: true + +# Storage configuration optimized for cluster-forge +lgtm: + storage: + # Tempo storage for traces + tempo: 50Gi + # Loki storage for logs + loki: 50Gi + # Grafana storage for dashboards/config + grafana: 10Gi + # Mimir/Prometheus storage for metrics + mimir: 50Gi + # Loki additional storage + extra: 50Gi + + # LGTM stack main deployment resources + resources: + limits: + memory: 8Gi + requests: + memory: 2Gi + cpu: '1' + +# Resource configuration optimized for cluster-forge +collectors: + resources: + # Metrics collector (deployment mode) + metrics: + limits: + memory: 8Gi + cpu: '2' + requests: + memory: 1Gi + cpu: 500m + # Logs collector (daemonset mode) + logs: + limits: + memory: 2Gi + cpu: '1' + requests: + memory: 400Mi + cpu: 200m + +# Service configuration +services: + # Main LGTM stack service ports + lgtm: + grafana: 3000 + otelGrpc: 4317 + otelHttp: 4318 + prometheus: 9090 + loki: 3100 + # Kube state metrics service port + kubeStateMetrics: + http: 8080 + # Node exporter service port + nodeExporter: + metrics: 9100 \ No newline at end of file
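
The `legendFormat` rewrites in the dashboard ConfigMaps above use Helm's usual way of emitting literal `{{` and `}}` from a template: printing the delimiters as quoted string literals. A minimal sketch of the pattern, assuming the dashboard JSON is rendered by `helm template`; the standalone snippet below is illustrative and not a file in this diff:

```yaml
# Illustrative only. Inside a chart template, a Grafana/Prometheus legend
# placeholder must be escaped, otherwise Helm tries to evaluate {{namespace}}
# as a template expression and rendering fails.
legendFormat: '{{ "{{" }}namespace{{ "}}" }} ({{ "{{" }}persistentvolumeclaim{{ "}}" }})'
# After rendering, the manifest contains the literal string
#   {{namespace}} ({{persistentvolumeclaim}})
# which Grafana expands per series from the metric's label values.
```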
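Moving the manifests under `sources/otel-lgtm-stack/v1.0.7/templates/` replaces every hard-coded `otel-lgtm-stack` namespace, port number, and storage request with a template expression resolved from the chart's values. A minimal sketch of how one of the PVCs above pairs with the new `values.yaml` keys (condensed from the diff, not a complete file):

```yaml
# templates/lgtm-stack.yaml (excerpt of the pattern introduced above)
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: tempo-pvc
  namespace: {{ .Release.Namespace }}             # release namespace instead of a fixed string
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.lgtm.storage.tempo }}   # resolved from values.yaml / values_cf.yaml

# values.yaml (excerpt)
lgtm:
  storage:
    tempo: 50Gi
```

Assuming the chart metadata is in place, rendering can be sanity-checked locally with something like `helm template sources/otel-lgtm-stack/v1.0.7 -f sources/otel-lgtm-stack/values_cf.yaml`, which should substitute the release namespace and the `50Gi` request into the PVC.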
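`values_cf.yaml` mirrors the chart defaults but leaves `cluster.name` empty, since that value is filled in by the deploying app based on the domain. A hypothetical per-site override file, layered on top of the defaults at render time, might look like the sketch below; the file name and the reduced sizes are examples only, not part of this change:

```yaml
# site-overrides.yaml -- hypothetical example, not a file in this diff
cluster:
  name: demo.example.com   # normally injected by the deployment tooling; set manually for local rendering

lgtm:
  storage:
    tempo: 20Gi            # smaller trace volume for a test cluster
    loki: 20Gi
  resources:
    requests:
      memory: 1Gi          # below the 2Gi default for constrained environments
```

Because later `-f` files and `--set` flags take precedence in Helm, such a file only needs to list the keys that differ from `values.yaml` and `values_cf.yaml`.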