---
# Namespace that the monitoring stack is installed into (the HelmChart's
# targetNamespace below).
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via the k3s helm-controller. Grafana is internal-only (ClusterIP) for
# now; an ingress stub is commented at the bottom for when you want
# grafana.roysland.net.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  # Everything below `|-` is a literal string handed to Helm as the chart's
  # values file; its indentation is relative to the first content line.
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): this admin password is stored in plaintext alongside the
      # manifest. Rotate it and consider pointing grafana.admin.existingSecret
      # at a Kubernetes Secret instead of keeping the value here.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      # Keyed by provider name; entries under `default` are served by the
      # `default` provider declared above.
      dashboards:
        default:
          node-exporter-full:
            # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            gnetId: 1860
            revision: 45
            datasource: Prometheus
          proxmox:
            # Proxmox via Prometheus (pve-exporter)
            gnetId: 10347
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true

    # The chart's on/off switch for kube-state-metrics is `kubeStateMetrics.enabled`;
    # the hyphenated `kube-state-metrics` key only carries values for the sub-chart.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just
        # chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers
          # (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              # Pass the listed PVE address to the exporter as ?target=...
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE address as the `instance` label on the series.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape request to the in-cluster exporter service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
# values above:
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
# (and add a Middleware named https-redirect in the monitoring namespace)