|
| 1 | +package io.argus.cli.cluster; |
| 2 | + |
| 3 | +import java.util.List; |
| 4 | + |
| 5 | +/** |
| 6 | + * Aggregates health metrics across multiple JVM instances. |
| 7 | + */ |
| 8 | +public final class ClusterHealthAggregator { |
| 9 | + |
| 10 | + /** Known Prometheus metric name candidates for each dimension. */ |
| 11 | + private static final String[] HEAP_METRICS = { |
| 12 | + "argus_heap_used_percent", |
| 13 | + "jvm_memory_used_bytes", |
| 14 | + "heap_used_percent" |
| 15 | + }; |
| 16 | + private static final String[] GC_METRICS = { |
| 17 | + "argus_gc_overhead_percent", |
| 18 | + "jvm_gc_overhead_percent", |
| 19 | + "gc_overhead_percent" |
| 20 | + }; |
| 21 | + private static final String[] CPU_METRICS = { |
| 22 | + "argus_cpu_process_percent", |
| 23 | + "jvm_cpu_usage", |
| 24 | + "process_cpu_usage" |
| 25 | + }; |
| 26 | + private static final String[] VT_METRICS = { |
| 27 | + "argus_virtual_threads_active", |
| 28 | + "jvm_virtual_threads_active", |
| 29 | + "virtual_threads_active" |
| 30 | + }; |
| 31 | + private static final String[] LEAK_METRICS = { |
| 32 | + "argus_memory_leak_suspected", |
| 33 | + "memory_leak_suspected" |
| 34 | + }; |
| 35 | + |
| 36 | + private ClusterHealthAggregator() {} |
| 37 | + |
| 38 | + public record InstanceMetrics( |
| 39 | + String target, |
| 40 | + double heapPercent, |
| 41 | + double gcOverhead, |
| 42 | + double cpuPercent, |
| 43 | + boolean leakSuspected, |
| 44 | + long activeVThreads, |
| 45 | + boolean reachable |
| 46 | + ) {} |
| 47 | + |
| 48 | + public record AggregateStats( |
| 49 | + double heapMin, double heapMax, double heapAvg, |
| 50 | + double gcMin, double gcMax, double gcAvg, |
| 51 | + double cpuMin, double cpuMax, double cpuAvg, |
| 52 | + long vtTotal, |
| 53 | + int leakCount, |
| 54 | + String worstTarget, |
| 55 | + String worstReason |
| 56 | + ) {} |
| 57 | + |
| 58 | + /** |
| 59 | + * Extracts per-instance metrics from the raw Prometheus map. |
| 60 | + */ |
| 61 | + public static InstanceMetrics extract(String target, java.util.Map<String, Double> raw) { |
| 62 | + double heap = pick(raw, HEAP_METRICS, -1.0); |
| 63 | + double gc = pick(raw, GC_METRICS, -1.0); |
| 64 | + double cpu = pick(raw, CPU_METRICS, -1.0); |
| 65 | + double vt = pick(raw, VT_METRICS, 0.0); |
| 66 | + double leak = pick(raw, LEAK_METRICS, 0.0); |
| 67 | + return new InstanceMetrics(target, heap, gc, cpu, leak > 0.5, (long) vt, true); |
| 68 | + } |
| 69 | + |
| 70 | + /** |
| 71 | + * Computes aggregate statistics from a list of reachable instance metrics. |
| 72 | + */ |
| 73 | + public static AggregateStats aggregate(List<InstanceMetrics> instances) { |
| 74 | + List<InstanceMetrics> up = instances.stream().filter(InstanceMetrics::reachable).toList(); |
| 75 | + if (up.isEmpty()) { |
| 76 | + return new AggregateStats(-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,null,"no reachable instances"); |
| 77 | + } |
| 78 | + |
| 79 | + double heapMin = Double.MAX_VALUE, heapMax = -1, heapSum = 0; |
| 80 | + double gcMin = Double.MAX_VALUE, gcMax = -1, gcSum = 0; |
| 81 | + double cpuMin = Double.MAX_VALUE, cpuMax = -1, cpuSum = 0; |
| 82 | + long vtTotal = 0; |
| 83 | + int leakCount = 0; |
| 84 | + int heapCnt = 0, gcCnt = 0, cpuCnt = 0; |
| 85 | + |
| 86 | + for (InstanceMetrics m : up) { |
| 87 | + if (m.heapPercent() >= 0) { |
| 88 | + heapMin = Math.min(heapMin, m.heapPercent()); |
| 89 | + heapMax = Math.max(heapMax, m.heapPercent()); |
| 90 | + heapSum += m.heapPercent(); |
| 91 | + heapCnt++; |
| 92 | + } |
| 93 | + if (m.gcOverhead() >= 0) { |
| 94 | + gcMin = Math.min(gcMin, m.gcOverhead()); |
| 95 | + gcMax = Math.max(gcMax, m.gcOverhead()); |
| 96 | + gcSum += m.gcOverhead(); |
| 97 | + gcCnt++; |
| 98 | + } |
| 99 | + if (m.cpuPercent() >= 0) { |
| 100 | + cpuMin = Math.min(cpuMin, m.cpuPercent()); |
| 101 | + cpuMax = Math.max(cpuMax, m.cpuPercent()); |
| 102 | + cpuSum += m.cpuPercent(); |
| 103 | + cpuCnt++; |
| 104 | + } |
| 105 | + vtTotal += m.activeVThreads(); |
| 106 | + if (m.leakSuspected()) leakCount++; |
| 107 | + } |
| 108 | + |
| 109 | + // Worst instance: highest GC overhead, then heap |
| 110 | + InstanceMetrics worst = up.stream() |
| 111 | + .filter(m -> m.gcOverhead() >= 0) |
| 112 | + .max(java.util.Comparator.comparingDouble(InstanceMetrics::gcOverhead)) |
| 113 | + .or(() -> up.stream() |
| 114 | + .filter(m -> m.heapPercent() >= 0) |
| 115 | + .max(java.util.Comparator.comparingDouble(InstanceMetrics::heapPercent))) |
| 116 | + .orElse(up.get(0)); |
| 117 | + |
| 118 | + StringBuilder reason = new StringBuilder(); |
| 119 | + if (worst.gcOverhead() >= 0) { |
| 120 | + reason.append(String.format("GC overhead %.1f%%", worst.gcOverhead())); |
| 121 | + } |
| 122 | + if (worst.leakSuspected()) { |
| 123 | + if (!reason.isEmpty()) reason.append(", "); |
| 124 | + reason.append("memory leak suspected"); |
| 125 | + } |
| 126 | + |
| 127 | + return new AggregateStats( |
| 128 | + heapCnt > 0 ? heapMin : -1, heapCnt > 0 ? heapMax : -1, heapCnt > 0 ? heapSum / heapCnt : -1, |
| 129 | + gcCnt > 0 ? gcMin : -1, gcCnt > 0 ? gcMax : -1, gcCnt > 0 ? gcSum / gcCnt : -1, |
| 130 | + cpuCnt > 0 ? cpuMin : -1, cpuCnt > 0 ? cpuMax : -1, cpuCnt > 0 ? cpuSum / cpuCnt : -1, |
| 131 | + vtTotal, leakCount, worst.target(), reason.toString() |
| 132 | + ); |
| 133 | + } |
| 134 | + |
| 135 | + private static double pick(java.util.Map<String, Double> map, String[] keys, double def) { |
| 136 | + for (String key : keys) { |
| 137 | + Double v = map.get(key); |
| 138 | + if (v != null) return v; |
| 139 | + } |
| 140 | + return def; |
| 141 | + } |
| 142 | +} |
0 commit comments