monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)

Deploys kube-prometheus-stack via the k3s helm-controller: Prometheus, Grafana (internal-only), node-exporter and kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (API token kept in an out-of-band Secret), scrape configs for the external hosts (.48/.70/.71/.49), two community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. The Grafana liveness probe is relaxed so that slow first-boot migrations do not get the pod killed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
teddy 2026-06-09 23:45:02 +02:00
parent a064f41250
commit 0a02da49a4
3 changed files with 275 additions and 0 deletions

View file

@ -0,0 +1,91 @@
---
# Custom "Hardware Temperatures" Grafana dashboard, shipped as a ConfigMap.
# The `grafana_dashboard` label marks it for pickup by the Grafana dashboard
# sidecar. The JSON below defines two panels over `node_hwmon_temp_celsius`:
#   1. a stat panel with the current max temperature per instance, and
#   2. a time series of every sensor, labelled instance / chip / sensor.
# Both panels resolve their datasource through the `datasource` template
# variable declared under "templating".
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana-dashboard-temps
  labels:
    grafana_dashboard: "1"
data:
  hardware-temperatures.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 0,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${datasource}"},
          "fieldConfig": {
            "defaults": {
              "color": {"mode": "thresholds"},
              "unit": "celsius",
              "thresholds": {"mode": "absolute", "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 60},
                {"color": "orange", "value": 75},
                {"color": "red", "value": 85}
              ]}
            },
            "overrides": []
          },
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {
            "colorMode": "background",
            "graphMode": "area",
            "orientation": "auto",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
            "textMode": "auto"
          },
          "pluginVersion": "11.0.0",
          "title": "Max temperature per host (now)",
          "type": "stat",
          "targets": [
            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
             "expr": "max by (instance) (node_hwmon_temp_celsius)",
             "legendFormat": "{{instance}}", "refId": "A"}
          ]
        },
        {
          "datasource": {"type": "prometheus", "uid": "${datasource}"},
          "fieldConfig": {
            "defaults": {
              "color": {"mode": "palette-classic"},
              "unit": "celsius",
              "custom": {"drawStyle": "line", "lineWidth": 1, "fillOpacity": 10, "showPoints": "never"},
              "thresholds": {"mode": "absolute", "steps": [
                {"color": "green", "value": null},
                {"color": "red", "value": 85}
              ]}
            },
            "overrides": []
          },
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "options": {
            "legend": {"displayMode": "table", "placement": "bottom", "calcs": ["last", "max"]},
            "tooltip": {"mode": "multi"}
          },
          "title": "All temperature sensors",
          "type": "timeseries",
          "targets": [
            {"datasource": {"type": "prometheus", "uid": "${datasource}"},
             "expr": "node_hwmon_temp_celsius",
             "legendFormat": "{{instance}} / {{chip}} / {{sensor}}", "refId": "A"}
          ]
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["hardware", "temperature"],
      "templating": {"list": [
        {"current": {}, "hide": 0, "includeAll": false, "label": "Datasource",
         "multi": false, "name": "datasource", "options": [], "query": "prometheus",
         "refresh": 1, "type": "datasource"}
      ]},
      "time": {"from": "now-6h", "to": "now"},
      "timezone": "Europe/Oslo",
      "title": "Hardware Temperatures",
      "uid": "hw-temps",
      "version": 1
    }

131
monitoring/monitoring.yaml Normal file
View file

@ -0,0 +1,131 @@
# Namespace for the monitoring stack: the HelmChart in this file targets it,
# and the dashboard ConfigMap and the PVE exporter are created in it.
kind: Namespace
apiVersion: v1
metadata:
  name: monitoring
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via the k3s helm-controller. Grafana is internal-only (ClusterIP) for
# now; a commented-out ingress stub at the bottom shows how to expose it at
# grafana.roysland.net.
#
# The Grafana admin credentials are NOT stored in git. Create them out-of-band
# before applying this manifest:
#   kubectl -n monitoring create secret generic grafana-admin \
#     --from-literal=admin-user=admin \
#     --from-literal=admin-password=<password>
#
# NOTE(review): spec.version is not set, so presumably the newest chart in the
# repo gets installed on each (re)install — consider pinning a chart version.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false
    grafana:
      # Admin login comes from the out-of-band `grafana-admin` Secret instead
      # of a password committed to git.
      # NOTE(review): treat any password that was previously committed here as
      # leaked and rotate it. With persistence enabled, an already-initialised
      # Grafana presumably keeps its current admin password until it is changed
      # in Grafana itself — verify after rollout.
      admin:
        existingSecret: grafana-admin
        userKey: admin-user
        passwordKey: admin-password
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus
    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # The umbrella chart toggles the kube-state-metrics subchart through
    # `kubeStateMetrics.enabled`; the hyphenated `kube-state-metrics` key only
    # carries values passed down to the subchart and has no `enabled` switch.
    kubeStateMetrics:
      enabled: true
    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              # Pass the listed target to the exporter as ?target=<host> ...
              - source_labels: [__address__]
                target_label: __param_target
              # ... keep that host as the `instance` label ...
              - source_labels: [__param_target]
                target_label: instance
              # ... and send the actual scrape to the in-cluster exporter Service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
# values above:
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
# (and add a Middleware named https-redirect in the monitoring namespace)

View file

@ -0,0 +1,53 @@
# prometheus-pve-exporter — exposes Proxmox VE metrics for Prometheus.
# The API token lives in the `pve-exporter` Secret, created out-of-band (NOT in git):
#   kubectl -n monitoring create secret generic pve-exporter \
#     --from-literal=PVE_USER=monitoring@pve \
#     --from-literal=PVE_TOKEN_NAME=prometheus \
#     --from-literal=PVE_TOKEN_VALUE=<token> \
#     --from-literal=PVE_VERIFY_SSL=false
# NOTE(review): PVE_VERIFY_SSL=false presumably disables TLS certificate
# verification towards the Proxmox API — confirm that is intended.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-pve-exporter
  namespace: monitoring
  labels:
    app: prometheus-pve-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-pve-exporter
  template:
    metadata:
      labels:
        app: prometheus-pve-exporter
    spec:
      containers:
        - name: pve-exporter
          image: prompve/prometheus-pve-exporter:3.9.0
          # Every key of the `pve-exporter` Secret becomes an environment variable.
          envFrom:
            - secretRef:
                name: pve-exporter
          ports:
            - containerPort: 9221
              name: http
          # TCP-level probes only: they check that the exporter is listening
          # without triggering a (potentially slow) query against Proxmox.
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 15
            periodSeconds: 20
            failureThreshold: 3
          # Small footprint so the scheduler can account for the pod; the memory
          # limit is a guard rail only.
          # NOTE(review): values are estimates — tune against observed usage.
          resources:
            requests:
              cpu: 10m
              memory: 64Mi
            limits:
              memory: 256Mi
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            seccompProfile:
              type: RuntimeDefault
---
# In-cluster endpoint for the PVE exporter pods (selected by the
# `app: prometheus-pve-exporter` label). The proxmox-pve scrape job reaches it
# at prometheus-pve-exporter.monitoring.svc.cluster.local:9221.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-pve-exporter
spec:
  type: ClusterIP
  selector:
    app: prometheus-pve-exporter
  ports:
    - protocol: TCP
      port: 9221
      targetPort: 9221