monitoring: add Prometheus + Grafana stack (kube-prometheus-stack)
Deploys kube-prometheus-stack via helm-controller: Prometheus, Grafana (internal-only), node-exporter, kube-state-metrics. Adds prometheus-pve-exporter for Proxmox (token in an out-of-band Secret), scrape configs for external hosts (.48/.70/.71/.49), community dashboards (Node Exporter Full, Proxmox via Prometheus) and a custom Hardware Temperatures dashboard. Grafana liveness made tolerant for slow first-boot migrations. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a064f41250
commit
0a02da49a4
3 changed files with 275 additions and 0 deletions
131
monitoring/monitoring.yaml
Normal file
131
monitoring/monitoring.yaml
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
# Namespace named "monitoring"; the kube-prometheus-stack HelmChart in this
# file uses it as its targetNamespace.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
||||
---
|
||||
# kube-prometheus-stack: Prometheus + Grafana + node-exporter + kube-state-metrics.
# Deployed via k3s helm-controller. Grafana is internal-only (ClusterIP) for now;
# an ingress stub is commented out at the bottom for when you want grafana.roysland.net.
#
# The HelmChart object itself lives in kube-system, while the release is
# installed into the "monitoring" namespace (targetNamespace).
# NOTE(review): spec.version is not set, so the chart release is not pinned —
# consider pinning it so upgrades happen deliberately.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  targetNamespace: monitoring
  valuesContent: |-
    # Alertmanager off for now (no notification channel configured yet).
    alertmanager:
      enabled: false

    grafana:
      # NOTE(review): this admin password is committed in plaintext. Prefer
      # creating a Secret out-of-band and referencing it via
      # grafana.admin.existingSecret (with userKey/passwordKey), then drop
      # this key. Left unchanged here so the deployment keeps working.
      adminPassword: "roysland-grafana-changeme"
      defaultDashboardsTimezone: Europe/Oslo
      service:
        type: ClusterIP
      # Grafana 13 first-boot migrations are slow on the local-path disk; be
      # tolerant so the liveness probe doesn't kill it mid-migration.
      livenessProbe:
        initialDelaySeconds: 120
        timeoutSeconds: 30
        failureThreshold: 30
      persistence:
        enabled: true
        storageClassName: local-path
        size: 2Gi
      # Sidecar loads any ConfigMap labelled grafana_dashboard across namespaces
      # (used by the custom Temperatures dashboard).
      sidecar:
        dashboards:
          enabled: true
          searchNamespace: ALL
      # Auto-provisioned community dashboards (downloaded from grafana.com on start).
      dashboardProviders:
        dashboardproviders.yaml:
          apiVersion: 1
          providers:
            - name: default
              orgId: 1
              folder: ""
              type: file
              disableDeletion: false
              editable: true
              options:
                path: /var/lib/grafana/dashboards/default
      dashboards:
        # The key "default" must match the provider name above.
        default:
          node-exporter-full:
            gnetId: 1860  # Node Exporter Full (CPU/RAM/disk/net + temps), pick host via $instance
            revision: 45
            datasource: Prometheus
          proxmox:
            gnetId: 10347  # Proxmox via Prometheus (pve-exporter)
            revision: 5
            datasource: Prometheus

    # Node metrics for the k3s host itself.
    nodeExporter:
      enabled: true
    # The parent chart's toggle is "kubeStateMetrics"; "kube-state-metrics" is
    # the subchart's own values section, where "enabled" has no effect.
    kubeStateMetrics:
      enabled: true

    prometheus:
      prometheusSpec:
        retention: 15d
        scrapeInterval: 30s
        # Persist metrics on the local-path PVC.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
        # Pick up ServiceMonitors/PodMonitors from any namespace, not just chart-labelled ones.
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        # External targets: Proxmox host + the other VMs + test server.
        additionalScrapeConfigs:
          # node_exporter on the Proxmox host and the other servers (installed via ansible).
          - job_name: node-external
            static_configs:
              - targets:
                  - 192.168.50.48:9100  # proxmox host (master)
                  - 192.168.50.70:9100  # minio
                  - 192.168.50.71:9100  # forgejo
                  - 192.168.50.49:9100  # test server
                labels:
                  group: infra-hosts
          # Proxmox VE stats via prometheus-pve-exporter (deployed in-cluster).
          - job_name: proxmox-pve
            metrics_path: /pve
            params:
              module: [default]
            static_configs:
              - targets:
                  - 192.168.50.48  # the PVE node to query
            relabel_configs:
              # Pass the listed PVE node as the ?target= query parameter ...
              - source_labels: [__address__]
                target_label: __param_target
              # ... keep it as the "instance" label on the resulting series ...
              - source_labels: [__param_target]
                target_label: instance
              # ... and send the actual scrape request to the exporter Service.
              - target_label: __address__
                replacement: prometheus-pve-exporter.monitoring.svc.cluster.local:9221
# ---
# To expose Grafana later at grafana.roysland.net, set grafana.ingress in the
# values above:
#   grafana:
#     ingress:
#       enabled: true
#       ingressClassName: traefik
#       annotations:
#         cert-manager.io/cluster-issuer: letsencrypt
#         traefik.ingress.kubernetes.io/router.middlewares: monitoring-https-redirect@kubernetescrd
#       hosts: [grafana.roysland.net]
#       tls:
#         - hosts: [grafana.roysland.net]
#           secretName: grafana-tls
# (and add a Middleware named https-redirect in the monitoring namespace)
|
||||
Loading…
Add table
Add a link
Reference in a new issue