monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)

Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana (internal-only), node-exporter, kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (token in an out-of-band Secret), scrape configs for external hosts (.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. Grafana liveness made tolerant for slow first-boot migrations.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
teddy 2026-06-09 23:45:02 +02:00
parent a064f41250
commit 0a02da49a4
3 changed files with 275 additions and 0 deletions

131
monitoring/monitoring.yaml Normal file
View file

@ -0,0 +1,131 @@
---
# Namespace that holds the monitoring stack; the kube-prometheus-stack HelmChart
# in this file installs its release here via spec.targetNamespace.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via the k3s helm-controller: this HelmChart resource lives in
# kube-system, while the release itself is installed into targetNamespace.
# Grafana is internal-only (ClusterIP) for now; an ingress stub is commented at
# the bottom for when you want grafana.roysland.net.
#
# Prerequisite (created out-of-band, never committed): the Secret referenced by
# grafana.admin.existingSecret below, e.g.
#   kubectl -n monitoring create secret generic grafana-admin \
#     --from-literal=admin-user=admin \
#     --from-literal=admin-password='<choose-a-strong-password>'
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  # NOTE(review): no `version:` is set, so the chart version is not pinned and
  # presumably follows the newest release in the repo. Consider pinning one.
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # Admin credentials are read from an out-of-band Secret instead of being
      # committed here as a plaintext adminPassword. The previously committed
      # password remains in git history and should be treated as compromised.
      admin:
        existingSecret: grafana-admin
        userKey: admin-user
        passwordKey: admin-password
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # File-based provider that serves the community dashboards declared under
      # `dashboards.default` below (the provider path must match that key).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboards:
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true

    # Toggle for the kube-state-metrics component. The chart's switch is
    # `kubeStateMetrics.enabled`; the `kube-state-metrics:` key only carries
    # values through to the subchart and does not enable or disable it.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - '192.168.50.48:9100'  # proxmox host (master)
                  - '192.168.50.70:9100'  # minio
                  - '192.168.50.71:9100'  # forgejo
                  - '192.168.50.49:9100'  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - '192.168.50.48'  # the PVE node to query
            relabel_configs:
              # Pass the listed PVE node to the exporter as the ?target= parameter.
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE node (not the exporter) as the instance label.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape request to the in-cluster exporter Service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
# To expose Grafana later at grafana.roysland.net, add grafana.ingress to the
# values above (nested under the existing `grafana:` key):
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
# (and add a Middleware named https-redirect in the monitoring namespace)