144 lines
4.9 KiB
YAML
144 lines
4.9 KiB
YAML
# Namespace that holds the monitoring stack (the Helm release targets it and
# the Traefik redirect Middleware lives in it).
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Force HTTP->HTTPS at Traefik for the Grafana ingress (same pattern as headlamp/passbolt).
# Referenced from an Ingress annotation as `monitoring-https-redirect@kubernetescrd`
# (i.e. <namespace>-<name>@kubernetescrd).
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: https-redirect
  namespace: monitoring
spec:
  redirectScheme:
    scheme: https
    # Issue a permanent redirect rather than a temporary one.
    permanent: true
---
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via k3s helm-controller. Grafana is exposed at grafana.roysland.net
# (Let's Encrypt TLS via cert-manager, resolved by the *.roysland.net wildcard).
#
# NOTE(review): no `spec.version` is set, so the chart version is not pinned;
# consider pinning it so chart upgrades are deliberate.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  # The HelmChart object itself lives in kube-system; the release is installed
  # into `targetNamespace` below.
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  # Helm values, passed to the chart as a literal YAML document.
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): placeholder admin password committed in plaintext.
      # Rotate it and move it into a Secret referenced through
      # `admin.existingSecret` (with `admin.userKey` / `admin.passwordKey`).
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Reachable in the browser at grafana.roysland.net with a Let's Encrypt cert.
      ingress:
        enabled: true
        ingressClassName: traefik
        annotations:
          cert-manager.io/cluster-issuer: letsencrypt
          # <namespace>-<name>@kubernetescrd of the redirect Middleware.
          traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
        hosts:
          - grafana.roysland.net
        tls:
          - hosts:
              - grafana.roysland.net
            secretName: grafana-tls
      # Tell Grafana its external URL so redirects/assets work behind the proxy.
      grafana.ini:
        server:
          root_url: https://grafana.roysland.net
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        # Key must match the provider name above.
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # The parent chart's toggle for the kube-state-metrics dependency is
    # `kubeStateMetrics.enabled`; the `kube-state-metrics` key only passes
    # values through to the subchart.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              # Pass the listed PVE node to the exporter as the `target` URL parameter.
              - source_labels: [__address__]
                target_label: __param_target
              # Keep the PVE node as the `instance` label.
              - source_labels: [__param_target]
                target_label: instance
              # Send the actual scrape to the in-cluster exporter service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221