---
# Namespace for the monitoring stack: it is the targetNamespace of the
# kube-prometheus-stack HelmChart and holds the https-redirect Middleware.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Traefik Middleware that forces HTTP->HTTPS for the Grafana ingress (same
# pattern as headlamp/passbolt). The ingress attaches it through its
# router.middlewares annotation as monitoring-https-redirect@kubernetescrd,
# i.e. <namespace>-<name>@kubernetescrd.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: https-redirect
  namespace: monitoring
spec:
  redirectScheme:
    # Issue the redirect as a permanent one.
    permanent: true
    scheme: https
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via k3s helm-controller. Grafana is exposed at grafana.roysland.net
# (Let's Encrypt TLS via cert-manager, resolved by the *.roysland.net wildcard).
# The HelmChart object itself lives in kube-system; the release is installed
# into the monitoring namespace via targetNamespace.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  # NOTE(review): no `version` is set here, so the chart is not pinned to a
  # known-good release. Consider adding `version: <tested chart version>`.
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): plaintext admin password committed with the manifest.
      # The Grafana chart also accepts admin.existingSecret (with
      # admin.userKey / admin.passwordKey); consider moving this into a
      # Secret and rotating the value.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Reachable in the browser at grafana.roysland.net with a Let's Encrypt cert.
      ingress:
        enabled: true
        ingressClassName: traefik
        annotations:
          cert-manager.io/cluster-issuer: letsencrypt
          # <namespace>-<name>@kubernetescrd of the https-redirect Middleware.
          traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
        hosts:
          - grafana.roysland.net
        tls:
          - hosts:
              - grafana.roysland.net
            secretName: grafana-tls
      # Tell Grafana its external URL so redirects/assets work behind the proxy.
      grafana.ini:
        server:
          root_url: https://grafana.roysland.net
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      # The provider path below must match the `default` key under `dashboards`.
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        default:
          # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance.
          node-exporter-full:
            gnetId: 1860
            revision: 45
            datasource: Prometheus
          # Proxmox via Prometheus (pve-exporter).
          proxmox:
            gnetId: 10347
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # `kubeStateMetrics.enabled` is the chart's on/off toggle; the hyphenated
    # `kube-state-metrics` key only carries values for the subchart.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100   # proxmox host (master)
                  - 192.168.50.70:9100   # minio
                  - 192.168.50.71:9100   # forgejo
                  - 192.168.50.49:9100   # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48        # the PVE node to query
            relabel_configs:
              # Pass the listed PVE node to the exporter as the ?target= parameter.
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE node (not the exporter) as the instance label.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape request to the exporter service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221