---
# Namespace that holds the whole monitoring stack (the Middleware below and the
# chart's targetNamespace both refer to it).
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Force HTTP->HTTPS at Traefik for the Grafana ingress (same pattern as headlamp/passbolt).
# Referenced from the Grafana ingress annotation as
# "monitoring-https-redirect@kubernetescrd" (<namespace>-<name>@kubernetescrd).
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: https-redirect
  namespace: monitoring
spec:
  redirectScheme:
    scheme: https
    permanent: true
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via k3s helm-controller. Grafana is exposed at grafana.roysland.net
# (Let's Encrypt TLS via cert-manager, resolved by the *.roysland.net wildcard).
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  # NOTE(review): no chart `version` is pinned here, so the deployed chart
  # version is presumably whatever the repo serves at install time — confirm
  # whether a pin is wanted for reproducible upgrades.
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): plaintext admin credential committed with the manifest.
      # Consider moving it to a Secret (the Grafana chart documents
      # `admin.existingSecret`) — TODO confirm and rotate.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Reachable in the browser at grafana.roysland.net with a Let's Encrypt cert.
      ingress:
        enabled: true
        ingressClassName: traefik
        annotations:
          cert-manager.io/cluster-issuer: letsencrypt
          traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
        hosts:
          - grafana.roysland.net
        tls:
          - hosts:
              - grafana.roysland.net
            secretName: grafana-tls
      # Tell Grafana its external URL so redirects/assets work behind the proxy.
      grafana.ini:
        server:
          root_url: https://grafana.roysland.net
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true

    # Component toggle for kube-state-metrics. The chart's switch is
    # `kubeStateMetrics.enabled`; the hyphenated `kube-state-metrics:` key is
    # reserved for overrides passed down to the subchart.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              # Pass the listed target to the exporter as ?target=<addr> ...
              - source_labels: [__address__]
                target_label: __param_target
              # ... keep that address as the `instance` label ...
              - source_labels: [__param_target]
                target_label: instance
              # ... and send the actual scrape to the in-cluster exporter service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221