From 7040788d012bb441784df0d56b8510828f449142 Mon Sep 17 00:00:00 2001 From: Marco van Zijl Date: Sun, 9 Nov 2025 19:26:18 +0100 Subject: [PATCH] Update Loki dependency version and remove unused Thanos configurations --- apps/logging/loki/Chart.yaml | 2 +- .../grafana/templates/extra-objects.yaml | 4 - apps/monitoring/grafana/values.yaml | 180 +++++++++++------- apps/monitoring/prometheus/values.yaml | 148 +++++++------- apps/monitoring/thanos/Chart.yaml | 11 -- apps/monitoring/thanos/application.yaml | 30 --- .../thanos/templates/extra-objects.yaml | 4 - apps/monitoring/thanos/values.yaml | 130 ------------- 8 files changed, 178 insertions(+), 331 deletions(-) delete mode 100644 apps/monitoring/grafana/templates/extra-objects.yaml delete mode 100644 apps/monitoring/thanos/Chart.yaml delete mode 100644 apps/monitoring/thanos/application.yaml delete mode 100644 apps/monitoring/thanos/templates/extra-objects.yaml delete mode 100644 apps/monitoring/thanos/values.yaml diff --git a/apps/logging/loki/Chart.yaml b/apps/logging/loki/Chart.yaml index 3cdec7e..b7bb975 100644 --- a/apps/logging/loki/Chart.yaml +++ b/apps/logging/loki/Chart.yaml @@ -7,5 +7,5 @@ appVersion: "3.5.7" dependencies: - name: loki - version: 6.45.2 + version: 6.46.0 repository: https://grafana.github.io/helm-charts diff --git a/apps/monitoring/grafana/templates/extra-objects.yaml b/apps/monitoring/grafana/templates/extra-objects.yaml deleted file mode 100644 index 8dd36ec..0000000 --- a/apps/monitoring/grafana/templates/extra-objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -{{- range .Values.extraObjects }} ---- -{{ toYaml . }} -{{- end }} diff --git a/apps/monitoring/grafana/values.yaml b/apps/monitoring/grafana/values.yaml index 01cc2bc..62ac8b1 100644 --- a/apps/monitoring/grafana/values.yaml +++ b/apps/monitoring/grafana/values.yaml @@ -1,15 +1,12 @@ grafana: - # Admin credentials + adminUser: admin adminPassword: changeme # TODO: Use secret management - - # Persistence + + # Disable local persistence - using PostgreSQL database persistence: - enabled: true - storageClassName: ceph-block - size: 10Gi - - # Resources + enabled: false + resources: requests: cpu: 100m @@ -17,23 +14,26 @@ grafana: limits: memory: 512Mi - # Datasources + extraSecretMounts: + - name: db-secret + secretName: grafana-pg-cluster-app + mountPath: /secrets/my-db + readOnly: true + datasources: datasources.yaml: apiVersion: 1 datasources: - # Thanos datasource - - name: Thanos + - name: Prometheus type: prometheus access: proxy - url: http://thanos-query-frontend.monitoring.svc.cluster.local:9090 + url: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 isDefault: true editable: false jsonData: timeInterval: 30s queryTimeout: 60s - # Loki datasource - name: Loki type: loki access: proxy @@ -42,12 +42,11 @@ grafana: jsonData: maxLines: 1000 derivedFields: - - datasourceUid: Thanos + - datasourceUid: Prometheus matcherRegex: "traceID=(\\w+)" name: TraceID url: "$${__value.raw}" - - # Dashboard providers + dashboardProviders: dashboardproviders.yaml: apiVersion: 1 @@ -68,42 +67,42 @@ grafana: editable: true options: path: /var/lib/grafana/dashboards/kubernetes - - # Preload dashboards + dashboards: default: - # Node exporter dashboard node-exporter: gnetId: 1860 revision: 37 - datasource: Thanos + datasource: Prometheus - # Kubernetes cluster monitoring k8s-cluster: gnetId: 7249 revision: 1 - datasource: Thanos + datasource: Prometheus kubernetes: - # Kubernetes pods k8s-pods: gnetId: 6417 revision: 1 - datasource: Thanos + datasource: Prometheus - # Loki logs dashboard loki-logs: gnetId: 13639 revision: 2 datasource: Loki - - # Grafana config + grafana.ini: server: root_url: https://grafana.noxxos.nl serve_from_sub_path: false - # Authentication - Authentik OIDC + database: + type: postgres + host: "$__file{/secrets/my-db/host}:$__file{/secrets/my-db/port}" + name: "$__file{/secrets/my-db/dbname}" + user: "$__file{/secrets/my-db/user}" + password: "$__file{/secrets/my-db/password}" + auth.generic_oauth: enabled: false # Enable after configuring secret name: Authentik @@ -127,51 +126,98 @@ grafana: users: auto_assign_org: true auto_assign_org_role: Viewer - - # Service Monitor + serviceMonitor: - enabled: true - - # Plugins + enabled: false + plugins: - grafana-piechart-panel - grafana-clock-panel -# Gateway API HTTPRoute -extraObjects: - # ReferenceGrant - - apiVersion: gateway.networking.k8s.io/v1beta1 - kind: ReferenceGrant - metadata: - name: traefik-gateway-access - namespace: monitoring - spec: - from: - - group: gateway.networking.k8s.io - kind: HTTPRoute - namespace: monitoring - to: - - group: "" - kind: Service - - # Grafana HTTPRoute - - apiVersion: gateway.networking.k8s.io/v1 - kind: HTTPRoute - metadata: - name: grafana - namespace: monitoring - spec: + route: + main: + enabled: true + hostnames: + - grafana.noxxos.nl parentRefs: - name: traefik-gateway namespace: traefik sectionName: websecure - hostnames: - - "grafana.noxxos.nl" - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - name: grafana - port: 80 + +extraObjects: + - apiVersion: postgresql.cnpg.io/v1 + kind: Cluster + metadata: + name: grafana-pg-cluster + namespace: monitoring + spec: + instances: 2 + postgresql: + parameters: + max_connections: "20" + shared_buffers: "25MB" + effective_cache_size: "75MB" + maintenance_work_mem: "6400kB" + checkpoint_completion_target: "0.9" + wal_buffers: "768kB" + default_statistics_target: "100" + random_page_cost: "1.1" + effective_io_concurrency: "300" + work_mem: "640kB" + huge_pages: "off" + max_wal_size: "128MB" + bootstrap: + initdb: + database: grafana + owner: grafana + storage: + size: 1Gi + storageClass: ceph-block + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + memory: 512Mi + backup: + method: plugin + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + retentionPolicy: "30d" + barmanObjectStore: + destinationPath: s3://postgresql-backups/grafana + endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 + s3Credentials: + accessKeyId: + name: grafana-pg-backup-creds + key: AWS_ACCESS_KEY_ID + secretAccessKey: + name: grafana-pg-backup-creds + key: AWS_SECRET_ACCESS_KEY + wal: + compression: bzip2 + data: + compression: bzip2 + scheduledBackups: + - name: daily-backup + schedule: "0 2 * * *" # 2 AM daily + backupOwnerReference: self + - apiVersion: objectbucket.io/v1alpha1 + kind: ObjectBucketClaim + metadata: + name: grafana-pg-backups + namespace: monitoring + spec: + bucketName: postgresql-backups + storageClassName: ceph-bucket + additionalConfig: + maxSize: "50Gi" + - apiVersion: v1 + kind: Secret + metadata: + name: grafana-pg-backup-creds + namespace: monitoring + type: Opaque + stringData: + AWS_ACCESS_KEY_ID: placeholder + AWS_SECRET_ACCESS_KEY: placeholder \ No newline at end of file diff --git a/apps/monitoring/prometheus/values.yaml b/apps/monitoring/prometheus/values.yaml index ae39744..3c2c27f 100644 --- a/apps/monitoring/prometheus/values.yaml +++ b/apps/monitoring/prometheus/values.yaml @@ -1,5 +1,26 @@ kube-prometheus-stack: - # Prometheus Operator + + crds: + enabled: true + + defaultRules: + create: false + + alertmanager: + enabled: false + + grafana: + enabled: false + + kubeProxy: + enabled: false + + kubeControllerManager: + enabled: false + + kubeEtcd: + enabled: false + prometheusOperator: enabled: true resources: @@ -8,25 +29,56 @@ kube-prometheus-stack: memory: 128Mi limits: memory: 256Mi - - # Prometheus configuration + networkPolicy: + enabled: true + flavor: Cilium + prometheus: enabled: true + networkPolicy: + enabled: true + flavor: Cilium + cilium: {} + # Disable Thanos integration + thanosService: + enabled: false + thanosServiceMonitor: + enabled: false + thanosServiceExternal: + enabled: false + thanosIngress: + enabled: false + + route: + main: + enabled: true + hostnames: + - prometheus.noxxos.nl + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: websecure + serviceMonitor: + selfMonitor: false prometheusSpec: - # Retention - retention: 24h - retentionSize: 15GB + # Enable compaction (was disabled for Thanos) + disableCompaction: false + scrapeInterval: 30s - # Resources + # 3 months retention (~90 days) + retention: 90d + retentionSize: 100GB + + replicas: 1 resources: requests: - cpu: 200m - memory: 1Gi + cpu: 100m + memory: 400Mi limits: memory: 2Gi - # Storage + # Increased storage for 3 month retention storageSpec: volumeClaimTemplate: spec: @@ -34,26 +86,10 @@ kube-prometheus-stack: accessModes: ["ReadWriteOnce"] resources: requests: - storage: 20Gi - - # Thanos sidecar configuration - thanos: - image: quay.io/thanos/thanos:v0.37.2 - version: v0.37.2 - objectStorageConfig: - name: thanos-objstore-secret - key: objstore.yml - - # External labels for Thanos - externalLabels: - cluster: homelab - prometheus: monitoring/prometheus - - # Replicas - replicas: 1 - replicaExternalLabelName: prometheus_replica + storage: 150Gi # Service monitors + scrapeConfigSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false podMonitorSelectorNilUsesHelmValues: false ruleSelectorNilUsesHelmValues: false @@ -61,30 +97,6 @@ kube-prometheus-stack: # Additional scrape configs additionalScrapeConfigs: [] - # Alertmanager - alertmanager: - enabled: true - alertmanagerSpec: - replicas: 1 - storage: - volumeClaimTemplate: - spec: - storageClassName: ceph-block - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 5Gi - resources: - requests: - cpu: 50m - memory: 128Mi - limits: - memory: 256Mi - - # Grafana (disabled - using separate Grafana deployment) - grafana: - enabled: false - # Node Exporter nodeExporter: enabled: true @@ -104,35 +116,3 @@ kube-prometheus-stack: memory: 128Mi limits: memory: 256Mi - - # Default rules - defaultRules: - create: true - rules: - alertmanager: true - etcd: false - configReloaders: true - general: true - k8s: true - kubeApiserverAvailability: true - kubeApiserverBurnrate: true - kubeApiserverHistogram: true - kubeApiserverSlos: true - kubeControllerManager: true - kubelet: true - kubeProxy: true - kubePrometheusGeneral: true - kubePrometheusNodeRecording: true - kubernetesApps: true - kubernetesResources: true - kubernetesStorage: true - kubernetesSystem: true - kubeSchedulerAlerting: true - kubeSchedulerRecording: true - kubeStateMetrics: true - network: true - node: true - nodeExporterAlerting: true - nodeExporterRecording: true - prometheus: true - prometheusOperator: true diff --git a/apps/monitoring/thanos/Chart.yaml b/apps/monitoring/thanos/Chart.yaml deleted file mode 100644 index 7fdcbeb..0000000 --- a/apps/monitoring/thanos/Chart.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v2 -name: thanos -description: Thanos distributed metrics wrapper chart -type: application -version: 1.0.0 -appVersion: "0.40.1" - -dependencies: - - name: thanos - version: 1.22.0 - repository: oci://ghcr.io/stevehipwell/helm-charts diff --git a/apps/monitoring/thanos/application.yaml b/apps/monitoring/thanos/application.yaml deleted file mode 100644 index 15e24ca..0000000 --- a/apps/monitoring/thanos/application.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: thanos - namespace: argocd - annotations: - argocd.argoproj.io/sync-wave: "1" - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: default - source: - repoURL: https://git.mvzijl.nl/marco/veda.git - targetRevision: applicationset-rewrite - path: apps/monitoring/thanos - helm: - releaseName: thanos - valueFiles: - - values.yaml - destination: - server: https://kubernetes.default.svc - namespace: monitoring - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=true - - ServerSideApply=true - - SkipDryRunOnMissingResource=true diff --git a/apps/monitoring/thanos/templates/extra-objects.yaml b/apps/monitoring/thanos/templates/extra-objects.yaml deleted file mode 100644 index 8dd36ec..0000000 --- a/apps/monitoring/thanos/templates/extra-objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -{{- range .Values.extraObjects }} ---- -{{ toYaml . }} -{{- end }} diff --git a/apps/monitoring/thanos/values.yaml b/apps/monitoring/thanos/values.yaml deleted file mode 100644 index 2c7bf30..0000000 --- a/apps/monitoring/thanos/values.yaml +++ /dev/null @@ -1,130 +0,0 @@ -thanos: - # Object storage configuration - objstoreConfig: - create: false # We create the secret via extraObjects - name: thanos-objstore-secret - key: objstore.yml - - # Image configuration - image: - registry: quay.io - repository: thanos/thanos - tag: v0.40.1 - - # Query component - query: - enabled: true - replicaCount: 2 - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - memory: 512Mi - stores: - - dnssrv+_grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local - - dnssrv+_grpc._tcp.thanos-receive.monitoring.svc.cluster.local - - # Query Frontend - queryFrontend: - enabled: true - replicaCount: 1 - resources: - requests: - cpu: 50m - memory: 128Mi - limits: - memory: 256Mi - - # Store Gateway - storegateway: - enabled: true - replicaCount: 1 - persistence: - enabled: true - storageClass: ceph-block - size: 10Gi - resources: - requests: - cpu: 100m - memory: 512Mi - limits: - memory: 1Gi - - # Compactor - compactor: - enabled: true - persistence: - enabled: true - storageClass: ceph-block - size: 10Gi - retentionResolutionRaw: 14d - retentionResolution5m: 90d - retentionResolution1h: 2y - resources: - requests: - cpu: 100m - memory: 512Mi - limits: - memory: 1Gi - extraFlags: - - --deduplication.replica-label=prometheus_replica - - --deduplication.replica-label=replica - - --downsampling.disable=false - - --compact.enable-vertical-compaction - - # Receive (for remote write from Prometheus) - receive: - enabled: true - replicaCount: 1 - persistence: - enabled: true - storageClass: ceph-block - size: 20Gi - resources: - requests: - cpu: 100m - memory: 512Mi - limits: - memory: 1Gi - - # Metrics and caching - # Note: Memcached configuration would be added here if using external caching - - # Metrics - metrics: - enabled: true - serviceMonitor: - enabled: true - -# S3 Bucket and credentials provisioning -extraObjects: - # ObjectBucketClaim for Thanos metrics - - apiVersion: objectbucket.io/v1alpha1 - kind: ObjectBucketClaim - metadata: - name: thanos-metrics - namespace: monitoring - spec: - bucketName: thanos-metrics - storageClassName: ceph-bucket - additionalConfig: - maxSize: "500Gi" - - # Secret with S3 credentials (will be populated by Rook) - # This is a placeholder - actual credentials come from the OBC - - apiVersion: v1 - kind: Secret - metadata: - name: thanos-objstore-secret - namespace: monitoring - type: Opaque - stringData: - objstore.yml: |- - type: S3 - config: - bucket: thanos-metrics - endpoint: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 - insecure: true - access_key: ${AWS_ACCESS_KEY_ID} - secret_key: ${AWS_SECRET_ACCESS_KEY}