#!/usr/bin/env bash
#
# LangSmith Kubernetes debugging bundle.
#
# Collects read-only diagnostics for a single namespace and packs them into an
# archive that can be shared with support:
#   - `kubectl get all` (wide + yaml), events, pod/node metrics (if available)
#   - nodes, ingresses, services, endpoints, PVCs, statefulsets, deployments
#   - per-container logs (current window, plus previous logs after a restart)
#   - optional AWS ALB target-group / target-health data (--include-aws)
#   - a summary.txt describing what was captured
#
# Usage:
#   ./get_k8s_debugging_info.sh --namespace <namespace> [--since <duration>] \
#       [--aws-region <region>] [--include-aws]
#
# Requires: kubectl.
# Optional: jq (restart counts), zip (else tar.gz), aws CLI (--include-aws).
set -euo pipefail

# ANSI colors stored as real escape bytes, so they are passed to printf as data
# (%s) instead of being spliced into the format string.
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly NC=$'\033[0m'
readonly NEWLINE=$'\n'

# Run configuration (filled in by parse_args / main).
NS=""
LOG_SINCE="24h"
AWS_REGION_DEFAULT="${AWS_REGION:-us-west-2}"
INCLUDE_AWS=false
DIR=""
HAVE_JQ=false
INGRESS_RESOURCES=()

# Progress goes to stdout; warnings and errors go to stderr.
info() { printf '%s[INFO]%s %s\n' "${BLUE}" "${NC}" "$*"; }
warn() { printf '%s[WARN]%s %s\n' "${YELLOW}" "${NC}" "$*" >&2; }
error() { printf '%s[ERROR]%s %s\n' "${RED}" "${NC}" "$*" >&2; }
ok() { printf '%s[OK]%s %s\n' "${GREEN}" "${NC}" "$*"; }

# Print command-line help to stdout.
usage() {
  cat <<'EOF'
Usage:
  ./get_k8s_debugging_info.sh --namespace <namespace> [--since <duration>] [--aws-region <region>] [--include-aws]

Options:
  --namespace <namespace>   Kubernetes namespace (required)
  --since <duration>        Log lookback window for current logs (default: 24h)
  --aws-region <region>     AWS region for ELBv2 calls (default: $AWS_REGION or us-west-2)
  --include-aws             Attempt AWS ALB/target-group diagnostics (requires aws CLI configured)

Notes:
  - This script reads cluster state and collects logs; it does not mutate Kubernetes resources.
  - Bundles output to .zip when available, else .tar.gz.
EOF
}

# Exit with a clear message when a value-taking option has no usable value.
# Without this check `shift 2` would abort silently under `set -e`.
# Arguments: $1 - option name, $2 - candidate value (may be absent)
require_value() {
  local opt="$1"
  local val="${2-}"
  if [[ -z "${val}" || "${val}" == -* ]]; then
    error "Option ${opt} requires a value"
    usage >&2
    exit 1
  fi
}

# Parse command-line options into NS, LOG_SINCE, AWS_REGION_DEFAULT, INCLUDE_AWS.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --namespace)
        require_value "$1" "${2-}"
        NS="$2"
        shift 2
        ;;
      --since)
        require_value "$1" "${2-}"
        LOG_SINCE="$2"
        shift 2
        ;;
      --aws-region)
        require_value "$1" "${2-}"
        AWS_REGION_DEFAULT="$2"
        shift 2
        ;;
      --include-aws)
        INCLUDE_AWS=true
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        error "Unknown parameter: $1"
        usage >&2
        exit 1
        ;;
    esac
  done

  if [[ -z "${NS}" ]]; then
    error "Missing --namespace"
    usage >&2
    exit 1
  fi
}

# Verify kubectl is usable and the namespace is reachable; note whether jq exists.
check_prerequisites() {
  if ! command -v kubectl >/dev/null 2>&1; then
    error "kubectl not found in PATH"
    exit 1
  fi

  if ! kubectl get namespace "${NS}" >/dev/null 2>&1; then
    error "Namespace '${NS}' does not exist or you do not have access."
    exit 1
  fi

  if command -v jq >/dev/null 2>&1; then
    HAVE_JQ=true
  else
    warn "jq not found. Per-container restart counting will be degraded (previous logs may be incomplete)."
  fi
}

# Run a command and store its combined stdout/stderr under ${DIR}.
# The command is executed directly from its argument vector (no `bash -c`), so
# values taken from the CLI or from cluster objects are never re-parsed as shell.
# Arguments: $1 - description, $2 - output file (relative to DIR), $3.. - command
# Returns:   always 0; a failing command is reported as a warning.
capture() {
  local desc="$1"
  local out="$2"
  shift 2

  info "Capturing: ${desc}"
  if "$@" > "${DIR}/${out}" 2>&1; then
    ok "Saved: ${out}"
  else
    warn "Failed: ${desc} (see ${out})"
  fi
}

# Print the space-separated names of all objects of kind $1 in the namespace.
# Best-effort: prints nothing when the lookup fails.
list_names() {
  kubectl get "$1" -n "${NS}" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true
}

# Print one jsonpath field ($2) of ingress $1; empty when missing or on error.
ingress_field() {
  kubectl get ingress "$1" -n "${NS}" -o jsonpath="$2" 2>/dev/null || true
}

# Print the restart count of container $2 in pod $1 (0 when it cannot be read).
restart_count() {
  local pod="$1"
  local container="$2"
  local count

  # `[]?` keeps jq quiet for pods that have no containerStatuses yet.
  count="$(kubectl get pod "${pod}" -n "${NS}" -o json \
    | jq -r --arg name "${container}" \
      '.status.containerStatuses[]? | select(.name == $name) | .restartCount // 0' \
      2>/dev/null)" || count=0
  [[ "${count}" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "${count}"
}

# Core listing: everything in the namespace, events, and pod metrics.
capture_core() {
  capture "Resources summary (get all -o wide)" "resources_summary.txt" \
    kubectl get all -n "${NS}" -o wide
  capture "Resources details (get all -o yaml)" "resources_details.yaml" \
    kubectl get all -n "${NS}" -o yaml
  capture "Kubernetes events" "events.txt" \
    kubectl get events -n "${NS}" --sort-by=.lastTimestamp

  # Probe first so no output file is created when metrics-server is absent.
  if kubectl top pods -n "${NS}" --containers >/dev/null 2>&1; then
    capture "Pod resource usage (kubectl top pods --containers)" "pod-resource-usage.txt" \
      kubectl top pods -n "${NS}" --containers
  else
    warn "Metrics not available for pods (metrics-server likely missing). Skipping pod-resource-usage.txt"
  fi
}

# Node list and node metrics.
capture_nodes() {
  capture "Node list (wide)" "nodes-wide.txt" kubectl get nodes -o wide

  if kubectl top nodes >/dev/null 2>&1; then
    capture "Node resource usage (kubectl top nodes)" "nodes-top.txt" kubectl top nodes
  else
    warn "Metrics not available for nodes (metrics-server likely missing). Skipping nodes-top.txt"
  fi
}

# Ingresses (list + describe + yaml), services (list + describe), endpoints.
# Also fills INGRESS_RESOURCES for the optional AWS step.
capture_network() {
  local raw ingress svc
  local -a services=()

  capture "Ingress resources (wide)" "ingress-list.txt" \
    kubectl get ingress -n "${NS}" -o wide

  raw="$(list_names ingress)"
  read -r -a INGRESS_RESOURCES <<< "${raw//${NEWLINE}/ }"
  if (( ${#INGRESS_RESOURCES[@]} > 0 )); then
    for ingress in "${INGRESS_RESOURCES[@]}"; do
      capture "Ingress describe: ${ingress}" "ingress-${ingress}-describe.txt" \
        kubectl describe ingress "${ingress}" -n "${NS}"
      capture "Ingress YAML: ${ingress}" "ingress-${ingress}.yaml" \
        kubectl get ingress "${ingress}" -n "${NS}" -o yaml
    done
  else
    warn "No ingresses found in namespace ${NS}"
  fi

  capture "Service list (wide)" "services-list.txt" kubectl get svc -n "${NS}" -o wide
  capture "Endpoints" "endpoints.txt" kubectl get endpoints -n "${NS}"

  raw="$(list_names svc)"
  read -r -a services <<< "${raw//${NEWLINE}/ }"
  if (( ${#services[@]} > 0 )); then
    for svc in "${services[@]}"; do
      capture "Service describe: ${svc}" "svc-${svc}-describe.txt" \
        kubectl describe svc "${svc}" -n "${NS}"
    done
  fi
}

# PVCs and workload controllers.
capture_workloads() {
  capture "PersistentVolumeClaims" "pvc-list.txt" kubectl get pvc -n "${NS}" -o wide
  capture "StatefulSets (wide)" "statefulsets.txt" kubectl get statefulsets -n "${NS}" -o wide
  capture "Deployments (wide)" "deployments.txt" kubectl get deployments -n "${NS}" -o wide
}

# Per pod/container logs: current logs within LOG_SINCE, plus previous logs for
# containers that restarted. Log retrieval is best-effort by design: a container
# without logs must not abort the whole capture.
capture_logs() {
  local raw pod container count
  local -a pods=()
  local -a containers=()

  info "Capturing container logs for all pods/containers (current + previous where restarted)..."
  raw="$(list_names pods)"
  read -r -a pods <<< "${raw//${NEWLINE}/ }"
  if (( ${#pods[@]} == 0 )); then
    warn "No pods found in namespace ${NS}; skipping logs."
    return 0
  fi

  for pod in "${pods[@]}"; do
    raw="$(kubectl get pod "${pod}" -n "${NS}" \
      -o jsonpath='{.spec.containers[*].name}' 2>/dev/null || true)"
    read -r -a containers <<< "${raw//${NEWLINE}/ }"
    (( ${#containers[@]} > 0 )) || continue

    for container in "${containers[@]}"; do
      info "Logs (current, since ${LOG_SINCE}): ${pod}/${container}"
      kubectl logs -n "${NS}" "${pod}" -c "${container}" --since="${LOG_SINCE}" \
        > "${DIR}/logs/${pod}_${container}_current.log" 2>/dev/null || true

      if [[ "${HAVE_JQ}" == "true" ]]; then
        count="$(restart_count "${pod}" "${container}")"
        if (( count > 0 )); then
          info "Logs (previous): ${pod}/${container} restarted (${count})"
          kubectl logs -n "${NS}" "${pod}" -c "${container}" --previous \
            > "${DIR}/logs/${pod}_${container}_previous.log" 2>/dev/null || true
        fi
      else
        # Without jq the restart count is unknown: try previous logs anyway.
        kubectl logs -n "${NS}" "${pod}" -c "${container}" --previous \
          > "${DIR}/logs/${pod}_${container}_previous.log" 2>/dev/null || true
      fi
    done
  done
}

# Capture ALB ARN, target groups and target health for one ingress ($1).
# The ALB is identified, in order, by:
#   1) alb.ingress.kubernetes.io/load-balancer-id holding a full ARN
#      (nonstandard annotation; kept for backward compatibility)
#   2) alb.ingress.kubernetes.io/load-balancer-name
#   3) .status.loadBalancer.ingress[0].hostname matched against ALB DNS names
capture_alb_for_ingress() {
  local ingress="$1"
  local lb_name lb_id_ann lb_host raw tg_arn tg_id
  local alb_arn=""
  local -a target_groups=()

  lb_name="$(ingress_field "${ingress}" \
    '{.metadata.annotations.alb\.ingress\.kubernetes\.io/load-balancer-name}')"
  lb_id_ann="$(ingress_field "${ingress}" \
    '{.metadata.annotations.alb\.ingress\.kubernetes\.io/load-balancer-id}')"
  lb_host="$(ingress_field "${ingress}" '{.status.loadBalancer.ingress[0].hostname}')"

  if [[ "${lb_id_ann}" == arn:aws:elasticloadbalancing:* ]]; then
    alb_arn="${lb_id_ann}"
  elif [[ -n "${lb_name}" ]]; then
    alb_arn="$(aws elbv2 describe-load-balancers --names "${lb_name}" \
      --region "${AWS_REGION_DEFAULT}" \
      --query 'LoadBalancers[0].LoadBalancerArn' --output text 2>/dev/null || true)"
  elif [[ -n "${lb_host}" ]]; then
    alb_arn="$(aws elbv2 describe-load-balancers --region "${AWS_REGION_DEFAULT}" \
      --query "LoadBalancers[?DNSName=='${lb_host}'].LoadBalancerArn | [0]" \
      --output text 2>/dev/null || true)"
  fi

  # `--output text` prints the literal string "None" for a null query result.
  if [[ -z "${alb_arn}" || "${alb_arn}" == "None" ]]; then
    warn "Could not resolve ALB ARN for ingress ${ingress} (name='${lb_name}', host='${lb_host}'). Skipping AWS checks."
    return 0
  fi

  capture "ALB info (ARN) for ingress ${ingress}" "alb-${ingress}-info.txt" \
    printf '%s\n' "${alb_arn}"

  if ! aws elbv2 describe-target-groups --load-balancer-arn "${alb_arn}" \
    --region "${AWS_REGION_DEFAULT}" >/dev/null 2>&1; then
    warn "AWS call failed: describe-target-groups for ALB ${alb_arn} (region ${AWS_REGION_DEFAULT})"
    return 0
  fi

  capture "ALB target groups for ${ingress}" "alb-${ingress}-target-groups.json" \
    aws elbv2 describe-target-groups --load-balancer-arn "${alb_arn}" \
    --region "${AWS_REGION_DEFAULT}"

  raw="$(aws elbv2 describe-target-groups --load-balancer-arn "${alb_arn}" \
    --region "${AWS_REGION_DEFAULT}" \
    --query 'TargetGroups[*].TargetGroupArn' --output text 2>/dev/null || true)"
  read -r -a target_groups <<< "${raw//${NEWLINE}/ }"

  if (( ${#target_groups[@]} == 0 )); then
    warn "No target groups found for ALB ${alb_arn}"
    return 0
  fi

  for tg_arn in "${target_groups[@]}"; do
    tg_id="${tg_arn##*/}" # last ARN path segment, used in the file name
    capture "Target health for ${ingress} TG=${tg_arn}" \
      "alb-${ingress}-target-health-${tg_id}.json" \
      aws elbv2 describe-target-health --target-group-arn "${tg_arn}" \
      --region "${AWS_REGION_DEFAULT}"
  done
}

# Optional AWS ALB diagnostics; gated behind --include-aws so restricted
# environments do not see surprising failures.
capture_aws() {
  local ingress

  if [[ "${INCLUDE_AWS}" != "true" ]]; then
    info "AWS diagnostics disabled (pass --include-aws to enable)."
    return 0
  fi
  if ! command -v aws >/dev/null 2>&1; then
    warn "aws CLI not found; skipping AWS diagnostics."
    return 0
  fi

  info "AWS diagnostics enabled. Attempting ALB target group + target health capture..."
  if (( ${#INGRESS_RESOURCES[@]} == 0 )); then
    warn "No ingresses found; skipping AWS ALB diagnostics."
    return 0
  fi

  for ingress in "${INGRESS_RESOURCES[@]}"; do
    capture_alb_for_ingress "${ingress}"
  done
}

# Write summary.txt: run metadata, what was attempted, and the captured files.
write_summary() {
  local summary_file="${DIR}/summary.txt"

  {
    echo "LangSmith Self-Hosted Diagnostics Summary"
    echo "========================================"
    echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
    echo "Namespace: ${NS}"
    echo "Output Directory: ${DIR}"
    echo ""
    echo "Configuration:"
    echo "  LOG_SINCE: ${LOG_SINCE}"
    echo "  AWS_REGION: ${AWS_REGION_DEFAULT}"
    echo "  INCLUDE_AWS: ${INCLUDE_AWS}"
    echo ""
    echo "Captured Information:"
    echo "  - kubectl get all (wide + yaml)"
    echo "  - events"
    echo "  - pod metrics (if available)"
    echo "  - per-container logs (current since window + previous on restart)"
    echo "  - ingress list + describe + yaml"
    echo "  - services list + describe"
    echo "  - endpoints"
    echo "  - nodes list + node metrics (if available)"
    echo "  - pvc list"
    echo "  - statefulsets + deployments"
    if [[ "${INCLUDE_AWS}" == "true" ]] && command -v aws >/dev/null 2>&1; then
      echo "  - ALB target groups + target health (best-effort)"
    fi
    echo ""
    echo "Files captured:"
    find "${DIR}" -type f | sort | sed 's|^|  |'
  } > "${summary_file}"

  ok "Diagnostics capture complete."
  ok "Summary: ${summary_file}"
}

# Pack DIR into DIR.zip, or DIR.tar.gz when zip is not installed.
# A failed archive step is reported explicitly; the raw files stay in DIR.
bundle() {
  info "Compressing directory..."
  if command -v zip >/dev/null 2>&1; then
    if zip -r "${DIR}.zip" "${DIR}" >/dev/null; then
      ok "Bundle written to ${DIR}.zip"
    else
      error "Failed to create ${DIR}.zip (captured files remain in ${DIR})"
      exit 1
    fi
  else
    warn "zip not available; falling back to tar.gz (zip is nicer for Slack uploads)."
    if tar -czf "${DIR}.tar.gz" -C "$(dirname "${DIR}")" "$(basename "${DIR}")"; then
      ok "Bundle written to ${DIR}.tar.gz"
    else
      error "Failed to create ${DIR}.tar.gz (captured files remain in ${DIR})"
      exit 1
    fi
  fi
}

main() {
  parse_args "$@"
  check_prerequisites

  # mktemp gives a private, non-predictable directory; the familiar
  # /tmp/langchain-debugging-<timestamp> prefix is kept.
  if ! DIR="$(mktemp -d "/tmp/langchain-debugging-$(date +%Y%m%d%H%M%S)-XXXXXX")"; then
    error "Could not create output directory under /tmp"
    exit 1
  fi
  mkdir -p "${DIR}/logs"

  info "Starting k8s debugging capture"
  info "Namespace: ${NS}"
  info "Output directory: ${DIR}"
  info "Log window: --since=${LOG_SINCE}"

  capture_core
  capture_nodes
  capture_network
  capture_workloads
  capture_logs
  capture_aws
  write_summary
  bundle
}

main "$@"