monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)

Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana (internal-only), node-exporter, kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (token in an out-of-band Secret), scrape configs for external hosts (.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. Grafana liveness made tolerant for slow first-boot migrations. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 23:45:02 +02:00 · 2026-06-09 23:45:02 +02:00 · 0a02da49a4
commit 0a02da49a4
parent a064f41250
3 changed files with 275 additions and 0 deletions
--- a/monitoring/monitoring.yaml
+++ b/monitoring/monitoring.yaml
@ -0,0 +1,131 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring  # same name the HelmChart below uses as its targetNamespace
+---
+# Installs kube-prometheus-stack (Prometheus, Grafana, node-exporter, kube-state-metrics)
+# through the k3s helm-controller. Grafana stays cluster-internal (ClusterIP) for now; the
+# commented stub at the end of this file shows how to publish it as grafana.roysland.net.
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  namespace: kube-system
+  name: kube-prometheus-stack
+spec:
+  repo: https://prometheus-community.github.io/helm-charts
+  chart: kube-prometheus-stack  # NOTE(review): no spec.version is set; consider pinning the chart version
+  targetNamespace: monitoring
+  valuesContent: |-
+    # No notification channel is configured yet, so Alertmanager stays disabled.
+    alertmanager:
+      enabled: false
+
+    grafana:
+      adminPassword: "roysland-grafana-changeme"  # NOTE(review): plaintext secret in git; prefer admin.existingSecret
+      defaultDashboardsTimezone: Europe/Oslo
+      service:
+        type: ClusterIP  # internal-only; no ingress is enabled in these values
+      # Grafana 13 first-boot migrations are slow on the local-path disk; be
+      # tolerant so the liveness probe doesn't kill it mid-migration.
+      livenessProbe:
+        initialDelaySeconds: 120
+        timeoutSeconds: 30
+        failureThreshold: 30
+      persistence:
+        enabled: true
+        storageClassName: local-path
+        size: 2Gi
+      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
+      # (used by the custom Temperatures dashboard).
+      sidecar:
+        dashboards:
+          enabled: true
+          searchNamespace: ALL
+      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
+      dashboardProviders:
+        dashboardproviders.yaml:
+          apiVersion: 1
+          providers:
+            - name: default
+              orgId: 1
+              folder: ""
+              type: file
+              disableDeletion: false
+              editable: true
+              options:
+                path: /var/lib/grafana/dashboards/default  # keep in sync with the "default" key under dashboards:
+      dashboards:
+        default:
+          node-exporter-full:
+            gnetId: 1860       # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
+            revision: 45
+            datasource: Prometheus
+          proxmox:
+            gnetId: 10347      # Proxmox via Prometheus (pve-exporter)
+            revision: 5
+            datasource: Prometheus
+
+    # Node metrics for the k3s host itself, plus Kubernetes object-state metrics.
+    nodeExporter:
+      enabled: true
+    kubeStateMetrics:  # chart toggle; "kube-state-metrics:" only carries subchart values
+      enabled: true
+
+    prometheus:
+      prometheusSpec:
+        retention: 15d
+        scrapeInterval: 30s
+        # Persist metrics on the local-path PVC.
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: local-path
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 10Gi  # NOTE(review): no retentionSize is set; confirm 15d of data fits in 10Gi
+        # Select every ServiceMonitor/PodMonitor regardless of labels, not only chart-labelled ones.
+        serviceMonitorSelectorNilUsesHelmValues: false
+        podMonitorSelectorNilUsesHelmValues: false
+        # External targets: Proxmox host + the other VMs + test server.
+        additionalScrapeConfigs:
+          # node_exporter on the Proxmox host and the other servers (installed via ansible).
+          - job_name: node-external
+            static_configs:
+              - targets:
+                  - 192.168.50.48:9100   # proxmox host (master)
+                  - 192.168.50.70:9100   # minio
+                  - 192.168.50.71:9100   # forgejo
+                  - 192.168.50.49:9100   # test server
+                labels:
+                  group: infra-hosts
+          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
+          - job_name: proxmox-pve
+            metrics_path: /pve
+            params:
+              module: [default]
+            static_configs:
+              - targets:
+                  - 192.168.50.48        # the PVE node to query
+            relabel_configs:
+              - source_labels: [__address__]  # 1) hand the listed PVE host to the exporter as ?target=
+                target_label: __param_target
+              - source_labels: [__param_target]  # 2) keep that host as the instance label
+                target_label: instance
+              - target_label: __address__  # 3) scrape the exporter Service instead of the PVE host
+                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
+# ---
+# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
+# values above:
+#   grafana:
+#     ingress:
+#       enabled: true
+#       ingressClassName: traefik
+#       annotations:
+#         cert-manager.io/cluster-issuer: letsencrypt
+#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
+#       hosts: [grafana.roysland.net]
+#       tls:
+#         - hosts: [grafana.roysland.net]
+#           secretName: grafana-tls
+#   (also create a Middleware "https-redirect" in namespace "monitoring"; the annotation names it <namespace>-<name>)