# k3s/monitoring/monitoring.yaml

---
# Namespace that holds the monitoring stack; it is the targetNamespace of the
# kube-prometheus-stack HelmChart defined later in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via the k3s helm-controller: the HelmChart object lives in kube-system,
# while the release itself is installed into the monitoring namespace.
# Grafana is internal-only (ClusterIP) for now; an ingress stub is commented at
# the bottom of this file for when you want grafana.roysland.net.
#
# NOTE(review): no `spec.version` is set, so the chart version is not pinned and
# presumably tracks the latest release in the repo at install time. Pin a
# version for reproducible deploys.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): plaintext credential committed to the repo. Prefer
      # grafana.admin.existingSecret (userKey/passwordKey) pointing at a Secret
      # managed outside git; kept as-is because this manifest creates no such
      # Secret.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      # Keep Grafana's data directory on a local-path PVC.
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      # The provider path below must match the `default` key under `dashboards`.
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        default:
          node-exporter-full:
            gnetId: 1860       # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347      # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # Cluster object metrics. The chart's toggle is `kubeStateMetrics.enabled`;
    # the `kube-state-metrics:` key only carries sub-chart values, so an
    # `enabled` flag placed there is ignored.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100   # proxmox host (master)
                  - 192.168.50.70:9100   # minio
                  - 192.168.50.71:9100   # forgejo
                  - 192.168.50.49:9100   # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          # NOTE(review): assumes a Service named prometheus-pve-exporter exists
          # in the monitoring namespace; it is not defined in this file.
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48        # the PVE node to query
            relabel_configs:
              # Pass the listed target to the exporter as the ?target= parameter.
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE node address as the instance label.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape to the exporter instead of the PVE node.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
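# To expose Grafana later at grafana.roysland.net, merge the following into the
# existing `grafana:` block of valuesContent above (do not add a second
# `grafana:` key; a duplicate key would silently override the first):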
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
#   (and add a Middleware named https-redirect in the monitoring namespace)