k3s/monitoring/monitoring.yaml
teddy 7ad8fd479f monitoring: expose Grafana at grafana.roysland.net (LE TLS + HTTPS redirect)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 23:51:42 +02:00

144 lines
4.9 KiB
YAML

---
# Namespace that holds the monitoring stack's resources.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Traefik middleware that answers plain-HTTP requests with a permanent redirect
# to HTTPS. Attached to the Grafana ingress (same pattern as headlamp/passbolt).
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  namespace: monitoring
  name: https-redirect
spec:
  redirectScheme:
    permanent: true
    scheme: https
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via the k3s helm-controller: this HelmChart object lives in kube-system,
# while the release itself is installed into the `monitoring` namespace.
# Grafana is exposed at grafana.roysland.net (Let's Encrypt TLS via cert-manager,
# resolved by the *.roysland.net wildcard).
# NOTE(review): no `spec.version` is set, so the chart version is not pinned —
# consider pinning one so upgrades are deliberate and reproducible.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false
    grafana:
      # NOTE(review): this admin password is committed in plaintext. Prefer the
      # Grafana chart's `admin.existingSecret` (with userKey/passwordKey) backed by
      # a Secret created out of band, and rotate this value after first login.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Reachable in the browser at grafana.roysland.net with a Let's Encrypt cert.
      ingress:
        enabled: true
        ingressClassName: traefik
        annotations:
          cert-manager.io/cluster-issuer: letsencrypt
          # Traefik CRD middleware reference: <namespace>-<name>@kubernetescrd.
          traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
        hosts:
          - grafana.roysland.net
        tls:
          - hosts:
              - grafana.roysland.net
            secretName: grafana-tls
      # Tell Grafana its external URL so redirects/assets work behind the proxy.
      grafana.ini:
        server:
          root_url: https://grafana.roysland.net
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      # Keyed by provider name; `default` matches the provider declared above.
      dashboards:
        default:
          node-exporter-full:
            # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            gnetId: 1860
            revision: 45
            datasource: Prometheus
          proxmox:
            # Proxmox via Prometheus (pve-exporter)
            gnetId: 10347
            revision: 5
            datasource: Prometheus
    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # kube-prometheus-stack toggles the kube-state-metrics subchart through
    # `kubeStateMetrics.enabled`; the hyphenated `kube-state-metrics:` key only
    # carries values passed through to the subchart and has no `enabled` switch.
    kubeStateMetrics:
      enabled: true
    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - '192.168.50.48:9100'  # proxmox host (master)
                  - '192.168.50.70:9100'  # minio
                  - '192.168.50.71:9100'  # forgejo
                  - '192.168.50.49:9100'  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - '192.168.50.48'  # the PVE node to query
            relabel_configs:
              # Hand the listed target to the exporter as the ?target= URL parameter.
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE node (not the exporter) as the `instance` label.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape request to the exporter service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221