monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)
Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana (internal-only), node-exporter, kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (token in an out-of-band Secret), scrape configs for external hosts (.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. Grafana liveness made tolerant for slow first-boot migrations. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a064f41250
commit
0a02da49a4
3 changed files with 275 additions and 0 deletions
91
monitoring/grafana-temps-dashboard.yaml
Normal file
91
monitoring/grafana-temps-dashboard.yaml
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
# Custom "Hardware Temperatures" Grafana dashboard, delivered as a ConfigMap.
# The grafana_dashboard label is the marker the Grafana dashboard sidecar
# looks for when collecting dashboard ConfigMaps.
#
# Panels:
#   1. stat       - highest node_hwmon_temp_celsius reading per instance
#   2. timeseries - every node_hwmon_temp_celsius series, one line per sensor
# The Prometheus datasource is chosen through the "datasource" template variable.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-temps
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
data:
  hardware-temperatures.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "graphTooltip": 0,
      "links": [],
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "unit": "celsius",
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "yellow",
                    "value": 60
                  },
                  {
                    "color": "orange",
                    "value": 75
                  },
                  {
                    "color": "red",
                    "value": 85
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 0
          },
          "id": 1,
          "options": {
            "colorMode": "background",
            "graphMode": "area",
            "orientation": "auto",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            },
            "textMode": "auto"
          },
          "pluginVersion": "11.0.0",
          "title": "Max temperature per host (now)",
          "type": "stat",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "max by (instance) (node_hwmon_temp_celsius)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ]
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "unit": "celsius",
              "custom": {
                "drawStyle": "line",
                "lineWidth": 1,
                "fillOpacity": 10,
                "showPoints": "never"
              },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 85
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 0
          },
          "id": 2,
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "bottom",
              "calcs": [
                "last",
                "max"
              ]
            },
            "tooltip": {
              "mode": "multi"
            }
          },
          "title": "All temperature sensors",
          "type": "timeseries",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_hwmon_temp_celsius",
              "legendFormat": "{{instance}} / {{chip}} / {{sensor}}",
              "refId": "A"
            }
          ]
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": [
        "hardware",
        "temperature"
      ],
      "templating": {
        "list": [
          {
            "current": {},
            "hide": 0,
            "includeAll": false,
            "label": "Datasource",
            "multi": false,
            "name": "datasource",
            "options": [],
            "query": "prometheus",
            "refresh": 1,
            "type": "datasource"
          }
        ]
      },
      "time": {
        "from": "now-6h",
        "to": "now"
      },
      "timezone": "Europe/Oslo",
      "title": "Hardware Temperatures",
      "uid": "hw-temps",
      "version": 1
    }
|
||||||
131
monitoring/monitoring.yaml
Normal file
131
monitoring/monitoring.yaml
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
# Namespace that holds the monitoring stack: the kube-prometheus-stack release
# targets it, and the pve-exporter and dashboard ConfigMap live in it.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
||||||
|
---
|
||||||
|
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via k3s helm-controller. Grafana is internal-only (ClusterIP) for now;
# an ingress stub is commented at the bottom for when you want grafana.roysland.net.
#
# Prerequisite: the Grafana admin credentials live in the `grafana-admin` Secret,
# created out-of-band (NOT in git):
#   kubectl -n monitoring create secret generic grafana-admin \
#     --from-literal=admin-user=admin \
#     --from-literal=admin-password=<password>
#
# NOTE(review): no chart `version` is pinned under spec, so the release follows
# whatever the repo currently serves. Consider pinning once a known-good version
# has been verified.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # Admin credentials come from a Secret instead of a plaintext value in
      # git. Any password that was previously committed should be treated as
      # leaked and rotated.
      # NOTE(review): with persistence enabled, Grafana presumably applies the
      # admin password only when its database is first initialised; confirm
      # and reset via grafana-cli if the instance already exists.
      admin:
        existingSecret: grafana-admin
        userKey: admin-user
        passwordKey: admin-password
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # The parent chart toggles the kube-state-metrics subchart through
    # `kubeStateMetrics.enabled`; the `kube-state-metrics:` key only carries
    # values passed down to the subchart and is not the on/off switch.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          # The relabelling passes the listed PVE node as ?target=, keeps it as
          # the `instance` label, and points the actual scrape at the exporter
          # Service.
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              - source_labels: [__address__]
                target_label: __param_target
              - source_labels: [__param_target]
                target_label: instance
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
# values above:
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
# (and add a Middleware named https-redirect in the monitoring namespace)
|
||||||
53
monitoring/pve-exporter.yaml
Normal file
53
monitoring/pve-exporter.yaml
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# prometheus-pve-exporter — exposes Proxmox VE metrics for Prometheus.
# The API token lives in the `pve-exporter` Secret, created out-of-band (NOT in git):
#   kubectl -n monitoring create secret generic pve-exporter \
#     --from-literal=PVE_USER=monitoring@pve \
#     --from-literal=PVE_TOKEN_NAME=prometheus \
#     --from-literal=PVE_TOKEN_VALUE=<token> \
#     --from-literal=PVE_VERIFY_SSL=false
# NOTE(review): PVE_VERIFY_SSL=false turns off TLS certificate verification
# towards the Proxmox API; switch it to true once the host presents a
# certificate the exporter can validate.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-pve-exporter
  namespace: monitoring
  labels:
    app: prometheus-pve-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-pve-exporter
  template:
    metadata:
      labels:
        app: prometheus-pve-exporter
    spec:
      containers:
        - name: pve-exporter
          image: prompve/prometheus-pve-exporter:3.9.0
          # Every key of the Secret becomes an environment variable.
          envFrom:
            - secretRef:
                name: pve-exporter
          ports:
            - containerPort: 9221
              name: http
          # TCP checks only: they confirm the listener is up without issuing
          # a request to the Proxmox API on every probe.
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 15
            periodSeconds: 20
            failureThreshold: 3
          # Conservative starting requests so the pod is not BestEffort; no
          # limits are set. Tune against observed usage.
          resources:
            requests:
              cpu: 10m
              memory: 64Mi
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            seccompProfile:
              type: RuntimeDefault
|
||||||
|
---
|
||||||
|
# ClusterIP Service in front of the pve-exporter pods. Prometheus reaches it at
# prometheus-pve-exporter.monitoring.svc.cluster.local:9221.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-pve-exporter
  namespace: monitoring
  labels:
    app: prometheus-pve-exporter
spec:
  selector:
    app: prometheus-pve-exporter
  ports:
    # Named to match the container port `http`, so the target follows the
    # container definition and a ServiceMonitor can reference it by name.
    - name: http
      port: 9221
      targetPort: http
      protocol: TCP
|
||||||
Loading…
Add table
Add a link
Reference in a new issue