monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)

Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana (internal-only), node-exporter, kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (token in an out-of-band Secret), scrape configs for external hosts (.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. Grafana liveness made tolerant for slow first-boot migrations. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 23:45:02 +02:00 · 2026-06-09 23:45:02 +02:00 · 0a02da49a4
commit 0a02da49a4
parent a064f41250
3 changed files with 275 additions and 0 deletions
--- a/monitoring/grafana-temps-dashboard.yaml
+++ b/monitoring/grafana-temps-dashboard.yaml
@ -0,0 +1,91 @@
+apiVersion: v1
+kind: ConfigMap  # carries the custom "Hardware Temperatures" Grafana dashboard as JSON
+metadata:
+  name: grafana-dashboard-temps  # two panels over node_hwmon_temp_celsius: per-host max (stat) and all sensors (timeseries)
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"  # label the Grafana dashboard sidecar looks for when loading ConfigMaps
+data:  # the value below is the dashboard JSON, stored verbatim
+  hardware-temperatures.json: |
+    {
+      "annotations": {"list": []},
+      "editable": true,
+      "graphTooltip": 0,
+      "links": [],
+      "panels": [
+        {
+          "datasource": {"type": "prometheus", "uid": "${datasource}"},
+          "fieldConfig": {
+            "defaults": {
+              "color": {"mode": "thresholds"},
+              "unit": "celsius",
+              "thresholds": {"mode": "absolute", "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 60},
+                {"color": "orange", "value": 75},
+                {"color": "red", "value": 85}
+              ]}
+            },
+            "overrides": []
+          },
+          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+          "id": 1,
+          "options": {
+            "colorMode": "background",
+            "graphMode": "area",
+            "orientation": "auto",
+            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+            "textMode": "auto"
+          },
+          "pluginVersion": "11.0.0",
+          "title": "Max temperature per host (now)",
+          "type": "stat",
+          "targets": [
+            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
+             "expr": "max by (instance) (node_hwmon_temp_celsius)",
+             "legendFormat": "{{instance}}", "refId": "A"}
+          ]
+        },
+        {
+          "datasource": {"type": "prometheus", "uid": "${datasource}"},
+          "fieldConfig": {
+            "defaults": {
+              "color": {"mode": "palette-classic"},
+              "unit": "celsius",
+              "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10, "showPoints": "never"},
+              "thresholds": {"mode": "absolute", "steps": [
+                {"color": "green", "value": null},
+                {"color": "red", "value": 85}
+              ]}
+            },
+            "overrides": []
+          },
+          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+          "id": 2,
+          "options": {
+            "legend": {"displayMode": "table", "placement": "bottom", "calcs": ["last", "max"]},
+            "tooltip": {"mode": "multi"}
+          },
+          "title": "All temperature sensors",
+          "type": "timeseries",
+          "targets": [
+            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
+             "expr": "node_hwmon_temp_celsius",
+             "legendFormat": "{{instance}} / {{chip}} / {{sensor}}", "refId": "A"}
+          ]
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "tags": ["hardware", "temperature"],
+      "templating": {"list": [
+        {"current": {}, "hide": 0, "includeAll": false, "label": "Datasource",
+         "multi": false, "name": "datasource", "options": [], "query": "prometheus",
+         "refresh": 1, "type": "datasource"}
+      ]},
+      "time": {"from": "now-6h", "to": "now"},
+      "timezone": "Europe/Oslo",
+      "title": "Hardware Temperatures",
+      "uid": "hw-temps",
+      "version": 1
+    }
--- a/monitoring/monitoring.yaml
+++ b/monitoring/monitoring.yaml
@ -0,0 +1,131 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring  # target namespace of the kube-prometheus-stack release below
+---
+# kube-prometheus-stack (Prometheus + Grafana + node-exporter + kube-state-metrics), deployed via
+# the k3s helm-controller. Grafana is internal-only (ClusterIP) for now; an ingress stub is
+# commented at the bottom for when you want grafana.roysland.net.
+# Prerequisite, created out-of-band (NOT in git) — the Grafana admin login:
+#   kubectl -n monitoring create secret generic grafana-admin \
+#     --from-literal=admin-user=admin --from-literal=admin-password=<password>
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  name: kube-prometheus-stack
+  namespace: kube-system
+spec:
+  chart: kube-prometheus-stack  # NOTE(review): no spec.version set — pin one to avoid surprise upgrades
+  repo: https://prometheus-community.github.io/helm-charts
+  targetNamespace: monitoring
+  valuesContent: |-
+    alertmanager:
+      enabled: false  # off for now: no notification channel configured yet
+
+    grafana:
+      # Admin credentials come from the out-of-band `grafana-admin` Secret, not from git.
+      admin:
+        existingSecret: grafana-admin  # keys: admin-user / admin-password
+      defaultDashboardsTimezone: Europe/Oslo
+      service:
+        type: ClusterIP
+      # Grafana 13 first-boot migrations are slow on the local-path disk; be
+      # tolerant so the liveness probe doesn't kill it mid-migration.
+      livenessProbe:
+        initialDelaySeconds: 120
+        timeoutSeconds: 30
+        failureThreshold: 30
+      persistence:
+        enabled: true
+        storageClassName: local-path
+        size: 2Gi
+      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
+      # (used by the custom Temperatures dashboard).
+      sidecar:
+        dashboards:
+          enabled: true
+          searchNamespace: ALL
+      dashboardProviders:  # file provider backing the community dashboards listed under `dashboards`
+        dashboardproviders.yaml:
+          apiVersion: 1
+          providers:
+            - name: default
+              orgId: 1
+              folder: ""
+              type: file
+              disableDeletion: false
+              editable: true
+              options:
+                path: /var/lib/grafana/dashboards/default
+      dashboards:  # community dashboards, downloaded from grafana.com on start
+        default:
+          node-exporter-full:
+            gnetId: 1860       # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
+            revision: 45
+            datasource: Prometheus
+          proxmox:
+            gnetId: 10347      # Proxmox via Prometheus (pve-exporter)
+            revision: 5
+            datasource: Prometheus
+
+    nodeExporter:  # node metrics for the k3s host itself
+      enabled: true
+    kubeStateMetrics:  # chart toggle (`kube-state-metrics:` is only the subchart's values block)
+      enabled: true
+
+    prometheus:
+      prometheusSpec:
+        retention: 15d
+        scrapeInterval: 30s
+        storageSpec:  # persist metrics on a local-path PVC
+          volumeClaimTemplate:
+            spec:
+              storageClassName: local-path
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 10Gi
+        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
+        serviceMonitorSelectorNilUsesHelmValues: false
+        podMonitorSelectorNilUsesHelmValues: false
+        additionalScrapeConfigs:  # external targets: Proxmox host + the other VMs + test server
+          # node_exporter on the Proxmox host and the other servers (installed via ansible).
+          - job_name: node-external
+            static_configs:
+              - targets:
+                  - 192.168.50.48:9100   # proxmox host (master)
+                  - 192.168.50.70:9100   # minio
+                  - 192.168.50.71:9100   # forgejo
+                  - 192.168.50.49:9100   # test server
+                labels:
+                  group: infra-hosts
+          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
+          - job_name: proxmox-pve
+            metrics_path: /pve
+            params:
+              module: [default]
+            static_configs:
+              - targets:
+                  - 192.168.50.48        # the PVE node to query
+            relabel_configs:
+              - source_labels: [__address__]
+                target_label: __param_target
+              - source_labels: [__param_target]
+                target_label: instance
+              - target_label: __address__
+                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
+# ---
+# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
+# values above:
+#   grafana:
+#     ingress:
+#       enabled: true
+#       ingressClassName: traefik
+#       annotations:
+#         cert-manager.io/cluster-issuer: letsencrypt
+#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
+#       hosts: [grafana.roysland.net]
+#       tls:
+#         - hosts: [grafana.roysland.net]
+#           secretName: grafana-tls
+#   (and add a Middleware named https-redirect in the monitoring namespace)
--- a/monitoring/pve-exporter.yaml
+++ b/monitoring/pve-exporter.yaml
@ -0,0 +1,53 @@
+# prometheus-pve-exporter — exposes Proxmox VE metrics for Prometheus.
+# The API token lives in the `pve-exporter` Secret, created out-of-band (NOT in git):
+#   kubectl -n monitoring create secret generic pve-exporter \
+#     --from-literal=PVE_USER=monitoring@pve --from-literal=PVE_TOKEN_NAME=prometheus \
+#     --from-literal=PVE_TOKEN_VALUE=<token> --from-literal=PVE_VERIFY_SSL=false
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-pve-exporter
+  namespace: monitoring
+  labels:
+    app: prometheus-pve-exporter
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus-pve-exporter
+  template:
+    metadata:
+      labels:
+        app: prometheus-pve-exporter  # must match spec.selector.matchLabels above
+    spec:
+      # The exporter is configured only with Proxmox credentials; it needs no Kubernetes API token.
+      automountServiceAccountToken: false
+      containers:
+        - name: pve-exporter
+          image: prompve/prometheus-pve-exporter:3.9.0
+          envFrom:  # every key of the `pve-exporter` Secret becomes an environment variable
+            - secretRef:
+                name: pve-exporter
+          ports:
+            - containerPort: 9221
+              name: http
+          securityContext:  # non-root, no privilege escalation, no capabilities, default seccomp
+            runAsNonRoot: true
+            runAsUser: 1000
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            seccompProfile:
+              type: RuntimeDefault
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: prometheus-pve-exporter  # name + namespace form the DNS host the proxmox-pve scrape job targets
+spec:
+  ports:
+    - targetPort: 9221  # the exporter container's port
+      port: 9221
+  selector:
+    app: prometheus-pve-exporter  # routes to the pods of the Deployment above