From 0a02da49a45d5383c51700444bbfe19d817350ca Mon Sep 17 00:00:00 2001
From: teddy
Date: Tue, 9 Jun 2026 23:45:02 +0200
Subject: [PATCH] monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)

Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana
(internal-only), node-exporter, kube-state-metrics. Adds
prometheus-pve-exporter for Proxmox, scrape configs for external hosts
(.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via
Prometheus) and a custom Hardware Temperatures dashboard. The Proxmox API
token and the Grafana admin login both come from out-of-band Secrets, so
no credential is committed. Grafana liveness made tolerant for slow
first-boot migrations.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 monitoring/grafana-temps-dashboard.yaml |  91 ++++++++++++++
 monitoring/monitoring.yaml              | 146 +++++++++++++++++++++++
 monitoring/pve-exporter.yaml            |  69 +++++++++++
 3 files changed, 306 insertions(+)
 create mode 100644 monitoring/grafana-temps-dashboard.yaml
 create mode 100644 monitoring/monitoring.yaml
 create mode 100644 monitoring/pve-exporter.yaml

diff --git a/monitoring/grafana-temps-dashboard.yaml b/monitoring/grafana-temps-dashboard.yaml
new file mode 100644
index 0000000..3e191c2
--- /dev/null
+++ b/monitoring/grafana-temps-dashboard.yaml
@@ -0,0 +1,91 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-temps
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+data:
+  hardware-temperatures.json: |
+    {
+      "annotations": {"list": []},
+      "editable": true,
+      "graphTooltip": 0,
+      "links": [],
+      "panels": [
+        {
+          "datasource": {"type": "prometheus", "uid": "${datasource}"},
+          "fieldConfig": {
+            "defaults": {
+              "color": {"mode": "thresholds"},
+              "unit": "celsius",
+              "thresholds": {"mode": "absolute", "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 60},
+                {"color": "orange", "value": 75},
+                {"color": "red", "value": 85}
+              ]}
+            },
+            "overrides": []
+          },
+          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+          "id": 1,
+          "options": {
+            "colorMode": "background",
+            "graphMode": "area",
+            "orientation": "auto",
+            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+            "textMode": "auto"
+          },
+          "pluginVersion": "11.0.0",
+          "title": "Max temperature per host (now)",
+          "type": "stat",
+          "targets": [
+            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
+             "expr": "max by (instance) (node_hwmon_temp_celsius)",
+             "legendFormat": "{{instance}}", "refId": "A"}
+          ]
+        },
+        {
+          "datasource": {"type": "prometheus", "uid": "${datasource}"},
+          "fieldConfig": {
+            "defaults": {
+              "color": {"mode": "palette-classic"},
+              "unit": "celsius",
+              "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10, "showPoints": "never"},
+              "thresholds": {"mode": "absolute", "steps": [
+                {"color": "green", "value": null},
+                {"color": "red", "value": 85}
+              ]}
+            },
+            "overrides": []
+          },
+          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+          "id": 2,
+          "options": {
+            "legend": {"displayMode": "table", "placement": "bottom", "calcs": ["last", "max"]},
+            "tooltip": {"mode": "multi"}
+          },
+          "title": "All temperature sensors",
+          "type": "timeseries",
+          "targets": [
+            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
+             "expr": "node_hwmon_temp_celsius",
+             "legendFormat": "{{instance}} / {{chip}} / {{sensor}}", "refId": "A"}
+          ]
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "tags": ["hardware", "temperature"],
+      "templating": {"list": [
+        {"current": {}, "hide": 0, "includeAll": false, "label": "Datasource",
+         "multi": false, "name": "datasource", "options": [], "query": "prometheus",
+         "refresh": 1, "type": "datasource"}
+      ]},
+      "time": {"from": "now-6h", "to": "now"},
+      "timezone": "Europe/Oslo",
+      "title": "Hardware Temperatures",
+      "uid": "hw-temps",
+      "version": 1
+    }
diff --git a/monitoring/monitoring.yaml b/monitoring/monitoring.yaml
new file mode 100644
index 0000000..79c3ea6
--- /dev/null
+++ b/monitoring/monitoring.yaml
@@ -0,0 +1,146 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+---
+# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
+# Deployed via k3s helm-controller. Grafana is internal-only (ClusterIP) for now;
+# an ingress stub is commented at the bottom for when you want grafana.roysland.net.
+#
+# The Grafana admin login lives in the `grafana-admin` Secret, created out-of-band
+# (NOT in git). Grafana will not start until that Secret exists:
+#   kubectl -n monitoring create secret generic grafana-admin \
+#     --from-literal=admin-user=admin \
+#     --from-literal=admin-password=REPLACE_WITH_STRONG_PASSWORD
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  name: kube-prometheus-stack
+  namespace: kube-system
+spec:
+  chart: kube-prometheus-stack
+  repo: https://prometheus-community.github.io/helm-charts
+  # NOTE(review): no `version:` is set, so whichever chart release is latest at
+  # install time gets deployed. Pin one once a known-good release is confirmed.
+  targetNamespace: monitoring
+  valuesContent: |-
+    # Alertmanager off for now (no notification channel configured yet).
+    alertmanager:
+      enabled: false
+
+    grafana:
+      # Admin credentials come from the out-of-band `grafana-admin` Secret so
+      # that no password is committed to git.
+      admin:
+        existingSecret: grafana-admin
+        userKey: admin-user
+        passwordKey: admin-password
+      defaultDashboardsTimezone: Europe/Oslo
+      service:
+        type: ClusterIP
+      # Grafana 13 first-boot migrations are slow on the local-path disk; be
+      # tolerant so the liveness probe doesn't kill it mid-migration.
+      livenessProbe:
+        initialDelaySeconds: 120
+        timeoutSeconds: 30
+        failureThreshold: 30
+      persistence:
+        enabled: true
+        storageClassName: local-path
+        size: 2Gi
+      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
+      # (used by the custom Temperatures dashboard).
+      sidecar:
+        dashboards:
+          enabled: true
+          searchNamespace: ALL
+      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
+      dashboardProviders:
+        dashboardproviders.yaml:
+          apiVersion: 1
+          providers:
+            - name: default
+              orgId: 1
+              folder: ""
+              type: file
+              disableDeletion: false
+              editable: true
+              options:
+                path: /var/lib/grafana/dashboards/default
+      dashboards:
+        default:
+          node-exporter-full:
+            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
+            revision: 45
+            datasource: Prometheus
+          proxmox:
+            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
+            revision: 5
+            datasource: Prometheus
+
+    # Node metrics for the k3s host itself, plus Kubernetes object state.
+    # These camelCase keys are the chart's enable switches; the hyphenated
+    # `kube-state-metrics:` key only passes values through to the subchart.
+    nodeExporter:
+      enabled: true
+    kubeStateMetrics:
+      enabled: true
+
+    prometheus:
+      prometheusSpec:
+        retention: 15d
+        scrapeInterval: 30s
+        # Persist metrics on the local-path PVC.
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: local-path
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 10Gi
+        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
+        serviceMonitorSelectorNilUsesHelmValues: false
+        podMonitorSelectorNilUsesHelmValues: false
+        # External targets: Proxmox host + the other VMs + test server.
+        additionalScrapeConfigs:
+          # node_exporter on the Proxmox host and the other servers (installed via ansible).
+          - job_name: node-external
+            static_configs:
+              - targets:
+                  - 192.168.50.48:9100  # proxmox host (master)
+                  - 192.168.50.70:9100  # minio
+                  - 192.168.50.71:9100  # forgejo
+                  - 192.168.50.49:9100  # test server
+                labels:
+                  group: infra-hosts
+          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
+          - job_name: proxmox-pve
+            metrics_path: /pve
+            params:
+              module: [default]
+            static_configs:
+              - targets:
+                  - 192.168.50.48  # the PVE node to query
+            relabel_configs:
+              - source_labels: [__address__]
+                target_label: __param_target
+              - source_labels: [__param_target]
+                target_label: instance
+              - target_label: __address__
+                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
+# ---
+# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
+# values above:
+#   grafana:
+#     ingress:
+#       enabled: true
+#       ingressClassName: traefik
+#       annotations:
+#         cert-manager.io/cluster-issuer: letsencrypt
+#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
+#       hosts: [grafana.roysland.net]
+#       tls:
+#         - hosts: [grafana.roysland.net]
+#           secretName: grafana-tls
+# (and add a Middleware named https-redirect in the monitoring namespace)
diff --git a/monitoring/pve-exporter.yaml b/monitoring/pve-exporter.yaml
new file mode 100644
index 0000000..dd5369a
--- /dev/null
+++ b/monitoring/pve-exporter.yaml
@@ -0,0 +1,69 @@
+# prometheus-pve-exporter — exposes Proxmox VE metrics for Prometheus.
+# The API token lives in the `pve-exporter` Secret, created out-of-band (NOT in git):
+#   kubectl -n monitoring create secret generic pve-exporter \
+#     --from-literal=PVE_USER=monitoring@pve \
+#     --from-literal=PVE_TOKEN_NAME=prometheus \
+#     --from-literal=PVE_TOKEN_VALUE=REPLACE_WITH_TOKEN_VALUE \
+#     --from-literal=PVE_VERIFY_SSL=false
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-pve-exporter
+  namespace: monitoring
+  labels:
+    app: prometheus-pve-exporter
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus-pve-exporter
+  template:
+    metadata:
+      labels:
+        app: prometheus-pve-exporter
+    spec:
+      containers:
+        - name: pve-exporter
+          image: prompve/prometheus-pve-exporter:3.9.0
+          envFrom:
+            - secretRef:
+                name: pve-exporter
+          ports:
+            - containerPort: 9221
+              name: http
+          # TCP checks only confirm the listener is up; they deliberately avoid
+          # the /pve path that Prometheus scrapes with a Proxmox target.
+          readinessProbe:
+            tcpSocket:
+              port: http
+            periodSeconds: 10
+          livenessProbe:
+            tcpSocket:
+              port: http
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          # Requests only, so the scheduler can place the pod; no limit is set.
+          resources:
+            requests:
+              cpu: 10m
+              memory: 64Mi
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1000
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            seccompProfile:
+              type: RuntimeDefault
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-pve-exporter
+  namespace: monitoring
+spec:
+  selector:
+    app: prometheus-pve-exporter
+  ports:
+    - port: 9221
+      targetPort: 9221