Added monitoring and logging stack

This commit is contained in:
Marco van Zijl 2025-11-09 17:12:33 +01:00
parent 81b69bc8e3
commit cac51f4416
40 changed files with 1665 additions and 148 deletions

View File

@ -0,0 +1,132 @@
# Mirroring CloudNativePG Barman Plugin
## Setup Mirror Repository
1. **Clone the upstream repository:**
```bash
cd /tmp
git clone --mirror https://github.com/cloudnative-pg/plugin-barman-cloud.git
cd plugin-barman-cloud.git
```
2. **Push to your Git server:**
```bash
# Create repo on your Git server first (git.mvzijl.nl)
# Then push:
git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git
```
3. **Set up periodic sync (optional):**
```bash
# Create a script to sync weekly
cat > /usr/local/bin/sync-barman-plugin.sh <<'EOF'
#!/bin/bash
cd /var/git/mirrors/plugin-barman-cloud.git
git fetch --prune origin
git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git
EOF
chmod +x /usr/local/bin/sync-barman-plugin.sh
# Add to cron (weekly on Sunday at 2 AM)
echo "0 2 * * 0 /usr/local/bin/sync-barman-plugin.sh" | crontab -
```
## Update Application Reference
After mirroring, update the application.yaml to use your mirror:
```yaml
spec:
source:
repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git
targetRevision: main # or specific tag like v1.0.0
path: deployments/manifests
```
## Version Pinning Strategy
Instead of tracking `main`, pin to specific releases:
```yaml
spec:
source:
repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git
targetRevision: v1.0.0 # Pin to specific version
path: deployments/manifests
```
This gives you:
- ✅ Predictable deployments
- ✅ Controlled updates
- ✅ Rollback capability
## Update Process
When a new version is released:
1. **Check upstream for updates:**
```bash
cd /var/git/mirrors/plugin-barman-cloud.git
git fetch origin
git tag -l
```
2. **Review changes:**
```bash
git log HEAD..origin/main --oneline
git diff HEAD..origin/main deployments/manifests/
```
3. **Sync to your mirror:**
```bash
git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git
```
4. **Update application.yaml:**
```yaml
targetRevision: v1.1.0 # Update to new version
```
5. **Test and deploy:**
```bash
git add apps/cloudnative-pg-plugin/application.yaml
git commit -m "Update barman plugin to v1.1.0"
git push
```
## Monitoring Upstream
Subscribe to releases:
- GitHub: Watch → Custom → Releases only
- RSS: `https://github.com/cloudnative-pg/plugin-barman-cloud/releases.atom`
## Alternative: Subtree Approach
Instead of mirroring, you could use git subtree:
```bash
cd /Users/marco/Documents/Hobby/Veda/talos
git subtree add --prefix vendor/plugin-barman-cloud \
https://github.com/cloudnative-pg/plugin-barman-cloud.git main --squash
# Then reference in application:
# path: vendor/plugin-barman-cloud/deployments/manifests
```
Update when needed:
```bash
git subtree pull --prefix vendor/plugin-barman-cloud \
https://github.com/cloudnative-pg/plugin-barman-cloud.git main --squash
```
## Recommended Approach
For this setup, the recommended approach is:
1. **Mirror to your Git server** at `git.mvzijl.nl/marco/plugin-barman-cloud`
2. **Pin to specific versions** (not `main`)
3. **Review updates** before applying
4. **Set up monitoring** for new releases
This gives you the best balance of control and maintainability.

View File

@ -0,0 +1,301 @@
# CloudNativePG Barman-Cloud Plugin
## Overview
The Barman Cloud Plugin provides object storage backup capabilities for CloudNativePG using the Barman toolset.
**Important**: As of CloudNativePG v1.26+, the native `barmanObjectStore` backup method is **deprecated**. You should use this plugin instead.
## Why This Plugin is Required
From the CloudNativePG 1.27 documentation:
> Starting with version 1.26, native backup and recovery capabilities are being progressively phased out of the core operator and moved to official CNPG-I plugins.
The built-in barman integration (`method: barmanObjectStore`) is deprecated and will be removed in future versions. This plugin provides the official replacement.
## What This Plugin Provides
- ✅ **WAL archiving** to S3-compatible object stores
- ✅ **Base backups** with compression and encryption
- ✅ **Point-in-time recovery (PITR)**
- ✅ **Retention policies** for automated cleanup
- ✅ **Backup from standby** servers
- ✅ **Support for multiple storage backends**: S3, Azure Blob, GCS, MinIO, Ceph S3 (RGW)
## Installation
This application deploys the plugin to the `cnpg-system` namespace where the CloudNativePG operator runs.
The plugin will be available for all PostgreSQL clusters managed by CloudNativePG.
## Configuration in PostgreSQL Clusters
### Using the Plugin (New Method)
```yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: my-cluster
spec:
backup:
target: prefer-standby
# Use the plugin method (required for v1.26+)
method: plugin
# Plugin configuration
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
# S3 configuration
barmanObjectStore:
destinationPath: s3://postgres-backups/
endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
# Credentials
s3Credentials:
accessKeyId:
name: backup-credentials
key: ACCESS_KEY_ID
secretAccessKey:
name: backup-credentials
key: ACCESS_SECRET_KEY
# Compression and parallelism
data:
compression: bzip2
jobs: 2
immediateCheckpoint: true
wal:
compression: bzip2
maxParallel: 2
# Retention policy
retentionPolicy: "30d"
# Tags for organization
tags:
environment: "production"
cluster: "my-cluster"
```
### Old Method (Deprecated)
```yaml
# ❌ DON'T USE - This is deprecated
spec:
backup:
method: barmanObjectStore # Deprecated!
barmanObjectStore:
# ... config
```
## WAL Archiving
The plugin also handles WAL archiving. Configure it at the cluster level:
```yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: my-cluster
spec:
backup:
# Backup configuration (as above)
...
# WAL archiving uses the same plugin configuration
# Automatically enabled when backup is configured
```
## Scheduled Backups
Create scheduled backups using the plugin:
```yaml
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
name: daily-backup
spec:
schedule: "0 0 2 * * *" # 2 AM daily
backupOwnerReference: self
cluster:
name: my-cluster
# Use plugin method
method: plugin
# Plugin configuration (or inherits from cluster)
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
```
## On-Demand Backups
Trigger manual backups:
```yaml
apiVersion: postgresql.cnpg.io/v1
kind: Backup
metadata:
name: manual-backup
spec:
cluster:
name: my-cluster
method: plugin
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
```
Or use kubectl:
```bash
kubectl cnpg backup my-cluster --method plugin
```
## Retention Policies
The plugin supports advanced retention policies:
```yaml
pluginConfiguration:
barmanObjectStore:
retentionPolicy: "30d" # Keep backups for 30 days
# or
# retentionPolicy: "7 days"
# retentionPolicy: "4 weeks"
# retentionPolicy: "3 months"
```
## Supported Storage Backends
### AWS S3
```yaml
destinationPath: s3://bucket-name/
# endpointURL not needed for AWS S3
```
### Ceph S3 (RGW) - Your Setup
```yaml
destinationPath: s3://postgres-backups/
endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
```
### Azure Blob Storage
```yaml
destinationPath: https://storageaccount.blob.core.windows.net/container/
```
### Google Cloud Storage
```yaml
destinationPath: gs://bucket-name/
```
### MinIO
```yaml
destinationPath: s3://bucket-name/
endpointURL: http://minio:9000
```
## Verification
After deploying, verify the plugin is running:
```bash
# Check plugin deployment
kubectl get deployment -n cnpg-system | grep plugin
# Check plugin pods
kubectl get pods -n cnpg-system -l app=barman-cloud-plugin
# Verify plugin is registered
kubectl get configmap -n cnpg-system cnpg-plugin-registry -o yaml
```
## Troubleshooting
### Plugin Not Found
If you see errors like "plugin not found":
```bash
# Check if plugin is deployed
kubectl get pods -n cnpg-system -l app=barman-cloud-plugin
# Check operator logs
kubectl logs -n cnpg-system -l app.kubernetes.io/name=cloudnative-pg
```
### Backup Failures
```bash
# Check backup status
kubectl get backup -n <namespace>
# Check backup logs
kubectl describe backup <backup-name> -n <namespace>
# Check PostgreSQL pod logs
kubectl logs -n <namespace> <postgres-pod> | grep -i backup
```
### WAL Archiving Issues
```bash
# Check WAL archive status
kubectl exec -it -n <namespace> <postgres-pod> -- \
psql -c "SELECT * FROM pg_stat_archiver;"
# Check plugin logs
kubectl logs -n cnpg-system -l app=barman-cloud-plugin
```
## Migration from Built-in to Plugin
If you're migrating from the deprecated `barmanObjectStore` method:
1. **Deploy this plugin application**
2. **Update your Cluster resource**:
```yaml
spec:
backup:
method: plugin # Change from barmanObjectStore
pluginConfiguration:
name: barman-cloud.cloudnative-pg.io
barmanObjectStore:
# Keep same configuration
```
3. **Existing backups remain accessible** - the plugin can read backups created by the built-in method
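One way to confirm this after migrating is a throw-away recovery test into a new cluster. This is a minimal sketch that follows the configuration style used in this document: the `externalClusters` plugin wiring is an assumption (the plugin documentation may use a different recovery layout), and `restored-cluster`, `serverName`, and the target timestamp are placeholders.
```yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: restored-cluster
  namespace: cnpg-system
spec:
  instances: 1
  storage:
    size: 50Gi
  bootstrap:
    recovery:
      source: origin
      # Optional: point-in-time recovery instead of the latest backup
      recoveryTarget:
        targetTime: "2025-11-01 12:00:00+01"
  externalClusters:
    - name: origin
      # Assumed wiring - mirrors the backup configuration shown above
      pluginConfiguration:
        name: barman-cloud.cloudnative-pg.io
        barmanObjectStore:
          destinationPath: s3://postgres-backups/
          endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
          serverName: my-cluster  # Folder of the original cluster in the bucket
          s3Credentials:
            accessKeyId:
              name: backup-credentials
              key: ACCESS_KEY_ID
            secretAccessKey:
              name: backup-credentials
              key: ACCESS_SECRET_KEY
```
If the restored cluster comes up healthy and reaches the requested target, the old backups are readable through the plugin.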
## Best Practices
1. ✅ **Always use the plugin** for CloudNativePG v1.26+
2. ✅ **Configure retention policies** to manage storage costs
3. ✅ **Enable backup from standby** to reduce primary load
4. ✅ **Use compression** (bzip2) to reduce storage usage
5. ✅ **Set up scheduled backups** for automated protection
6. ✅ **Test recovery procedures** regularly
7. ✅ **Monitor backup status** with Prometheus metrics (see the alert sketch after this list)
8. ✅ **Tag backups** for easy identification and filtering
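For practice 7, a `PrometheusRule` can alert when backups stop succeeding. The sketch below is a starting point only: the metric names (`cnpg_collector_last_available_backup_timestamp`, `cnpg_collector_last_failed_backup_timestamp`) are assumptions based on the operator's default exporter, so verify them against the `/metrics` output of your PostgreSQL pods (port 9187) before deploying.
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cnpg-backup-alerts
  namespace: monitoring
spec:
  groups:
    - name: cnpg-backups
      rules:
        - alert: CNPGBackupTooOld
          # Assumed metric name - confirm against the exporter output
          expr: time() - cnpg_collector_last_available_backup_timestamp > 2 * 86400
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: "No successful backup for {{ $labels.pod }} in the last 2 days"
        - alert: CNPGBackupFailed
          expr: cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Most recent backup attempt failed for {{ $labels.pod }}"
```
Because the Prometheus values in this repo set `ruleSelectorNilUsesHelmValues: false`, any `PrometheusRule` in the cluster is picked up automatically.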
## Next Steps
1. Deploy this application: `git add . && git commit && git push`
2. Wait for ArgoCD to sync
3. Update your PostgreSQL Cluster to use `method: plugin`
4. Create an S3 bucket for backups (ObjectBucketClaim)
5. Configure backup credentials
6. Test with an on-demand backup
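Steps 4 and 5 could look roughly like the sketch below, reusing the Rook object store already referenced above. The names are placeholders; Rook writes the generated credentials into a Secret named after the ObjectBucketClaim (here `postgres-backups`) with the keys `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`, so either point `s3Credentials` at those keys or copy them into the `backup-credentials` Secret used in the examples.
```yaml
# ObjectBucketClaim - Rook provisions the bucket plus a same-named Secret and ConfigMap
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
  name: postgres-backups
  namespace: cnpg-system
spec:
  bucketName: postgres-backups
  storageClassName: ceph-bucket
---
# Credentials Secret matching the s3Credentials references in the Cluster examples
# (fill in the values from the OBC-generated Secret/postgres-backups)
apiVersion: v1
kind: Secret
metadata:
  name: backup-credentials
  namespace: cnpg-system
type: Opaque
stringData:
  ACCESS_KEY_ID: <copy from Secret/postgres-backups>
  ACCESS_SECRET_KEY: <copy from Secret/postgres-backups>
```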
## Additional Resources
- [Barman Cloud Plugin Documentation](https://cloudnative-pg.io/plugin-barman-cloud/)
- [CloudNativePG Backup Guide](https://cloudnative-pg.io/documentation/1.27/backup/)
- [CNPG-I Plugin Architecture](https://cloudnative-pg.io/documentation/1.27/cnpg_i/)

View File

@ -0,0 +1,32 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: cloudnative-pg-plugin
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "0"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git
targetRevision: 0.9.0
path: deployments/manifests
destination:
server: https://kubernetes.default.svc
namespace: cnpg-system
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- ServerSideApply=true
# Ensure operator is healthy before deploying plugin
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m

View File

@ -47,98 +47,13 @@ CloudNativePG is a Kubernetes operator that manages PostgreSQL clusters using Ku
### Example Cluster (Commented Out)
The `values.yaml` includes a commented example cluster configuration with:
- **Storage**: `local-path` StorageClass (for development)
- **Backup**: Barman-cloud plugin with S3 (Ceph RGW) backend
- **Note**: See "Storage Considerations" section below
## ⚠️ Storage Considerations
### Local Path vs Ceph Block
The example cluster uses `local-path` StorageClass, which is suitable for:
- ✅ **Development/Testing**: Quick setup, no Ceph dependency
- ✅ **Single-node scenarios**: When HA isn't required
- ✅ **Learning/Experimentation**: Testing PostgreSQL features
**For production use, change to `ceph-block`:**
```yaml
storage:
storageClass: ceph-block # Instead of local-path
size: 50Gi
```
### Why Ceph Block for Production?
| Feature | local-path | ceph-block |
|---------|-----------|------------|
| **High Availability** | ❌ No | ✅ Yes |
| **Data Replication** | ❌ No | ✅ 2x copies |
| **Pod Mobility** | ❌ Pinned to node | ✅ Can move |
| **Snapshots** | ❌ No | ✅ Yes |
| **Auto Resize** | ❌ No | ✅ Yes |
| **Node Failure** | ❌ Data unavailable | ✅ Survives |
### Hybrid Approach (Recommended for Dev)
Even with local-path storage, the S3 backup provides safety:
- **Primary storage**: local-path (fast, simple)
- **Backups**: Ceph S3 (safe, replicated, off-node)
- **Recovery**: Restore from S3 if node fails
This gives you:
- ✅ Point-in-time recovery
- ✅ Off-node backup storage
- ✅ Disaster recovery capability
- ✅ Fast local performance
- ⚠️ But no automatic HA
## Barman-Cloud Backup Plugin
CloudNativePG uses the modern barman-cloud toolset for backups.
### Configuration Features:
```yaml
backup:
barmanObjectStore:
# Parallel processing
data:
compression: bzip2
jobs: 2 # Parallel compression threads
wal:
compression: bzip2
maxParallel: 2 # Parallel WAL uploads
# Metadata tags
tags:
environment: "development"
managed-by: "cloudnative-pg"
# Backup lineage tracking
historyTags:
environment: "development"
```
### Plugin Benefits:
- ✅ **Better S3 compatibility**: Works with all S3-compatible stores
- ✅ **Improved parallelism**: Faster backups for large databases
- ✅ **Enhanced error handling**: Better retry logic
- ✅ **Cloud-native design**: Optimized for object storage
- ✅ **Metadata tagging**: Better backup organization
### Backup Strategy:
1. **Continuous WAL archiving**: Real-time transaction logs to S3
2. **Scheduled full backups**: Complete database snapshots
3. **Point-in-time recovery**: Restore to any timestamp
4. **Retention policies**: Automatic cleanup of old backups
The `values.yaml` includes a commented example cluster configuration. See "Creating Your First Cluster" below.
## Creating Your First Cluster
### Option 1: Using extraObjects in values.yaml (Development)
### Option 1: Using extraObjects in values.yaml
Uncomment the `extraObjects` section in `values.yaml` for a development cluster:
Uncomment the `extraObjects` section in `values.yaml` and customize:
```yaml
extraObjects:
@ -149,36 +64,14 @@ extraObjects:
namespace: cnpg-system
spec:
instances: 2 # 1 primary + 1 replica
# Development: local-path for fast local storage
storage:
size: 50Gi
storageClass: local-path
# Backup to Ceph S3 for safety
backup:
retentionPolicy: "30d"
barmanObjectStore:
destinationPath: s3://postgres-backups/
endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
s3Credentials:
accessKeyId:
name: postgres-backup-credentials
key: ACCESS_KEY_ID
secretAccessKey:
name: postgres-backup-credentials
key: ACCESS_SECRET_KEY
data:
compression: bzip2
jobs: 2
wal:
compression: bzip2
maxParallel: 2
storageClass: ceph-block
```
### Option 2: Separate Application (Production)
### Option 2: Separate Application
For production, create a separate ArgoCD Application with ceph-block storage:
For production, create a separate ArgoCD Application for each database cluster:
```bash
mkdir -p apps/databases/my-app-db
@ -199,7 +92,6 @@ spec:
max_connections: "200"
shared_buffers: "256MB"
# Production: ceph-block for HA
storage:
size: 100Gi
storageClass: ceph-block
@ -207,7 +99,6 @@ spec:
monitoring:
enablePodMonitor: true
# Barman-cloud backup configuration
backup:
retentionPolicy: "30d"
barmanObjectStore:
@ -222,13 +113,6 @@ spec:
key: ACCESS_SECRET_KEY
data:
compression: bzip2
jobs: 2 # Parallel compression
wal:
compression: bzip2
maxParallel: 2 # Parallel WAL uploads
tags:
environment: "production"
application: "my-app"
wal:
compression: bzip2
```

View File

@ -41,26 +41,14 @@ cloudnative-pg:
# effective_io_concurrency: "300"
# monitoring:
# enablePodMonitor: true
#
# # Use local-path-provisioner for storage
# storage:
# size: 50Gi
# storageClass: local-path
#
# # Backup configuration using new plugin system
# storageClass: ceph-block
# backup:
# retentionPolicy: "30d"
#
# # Volume for barman backups (uses same StorageClass as main storage)
# volumeSnapshot:
# className: local-path
#
# # S3 backup using barman-cloud plugin
# barmanObjectStore:
# destinationPath: s3://postgres-backups/
# endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
#
# # S3 credentials reference
# s3Credentials:
# accessKeyId:
# name: postgres-backup-credentials
@ -68,20 +56,7 @@ cloudnative-pg:
# secretAccessKey:
# name: postgres-backup-credentials
# key: ACCESS_SECRET_KEY
#
# # Compression settings
# data:
# compression: bzip2
# jobs: 2
# wal:
# compression: bzip2
# maxParallel: 2
#
# # Tags for backup organization
# tags:
# environment: "development"
# managed-by: "cloudnative-pg"
#
# # Backup history and retention
# historyTags:
# environment: "development"

apps/harbor/README.md Normal file (0 lines)
View File

View File

@ -0,0 +1,11 @@
apiVersion: v2
name: loki
description: Grafana Loki logging stack wrapper chart
type: application
version: 1.0.0
appVersion: "3.5.7"
dependencies:
- name: loki
version: 6.45.2
repository: https://grafana.github.io/helm-charts

View File

@ -0,0 +1,30 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: loki
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "1"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/veda.git
targetRevision: applicationset-rewrite
path: apps/logging/loki
helm:
releaseName: loki
valueFiles:
- values.yaml
destination:
server: https://kubernetes.default.svc
namespace: logging
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true
- SkipDryRunOnMissingResource=true

View File

@ -0,0 +1,4 @@
{{- range .Values.extraObjects }}
---
{{ toYaml . }}
{{- end }}

View File

@ -0,0 +1,152 @@
loki:
# Single binary deployment mode
deploymentMode: SingleBinary
loki:
# Authentication
auth_enabled: false
# Common configuration
commonConfig:
replication_factor: 1
# Storage configuration
schemaConfig:
configs:
- from: "2024-01-01"
store: tsdb
object_store: s3
schema: v13
index:
prefix: loki_index_
period: 24h
# Storage backend configuration
storage:
type: s3
bucketNames:
chunks: loki-logs
ruler: loki-logs
admin: loki-logs
s3:
endpoint: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
region: us-east-1
insecure: true
s3ForcePathStyle: true
accessKeyId: ${AWS_ACCESS_KEY_ID}
secretAccessKey: ${AWS_SECRET_ACCESS_KEY}
# Limits and retention
limits_config:
retention_period: 90d
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
max_query_series: 10000
max_query_parallelism: 32
reject_old_samples: true
reject_old_samples_max_age: 168h
# Compactor configuration for retention
compactor:
working_directory: /var/loki/compactor
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
# Storage config
storage_config:
tsdb_shipper:
active_index_directory: /var/loki/tsdb-index
cache_location: /var/loki/tsdb-cache
# Note: shared_store was removed from the shipper config in Loki 3.x;
# the object store is taken from schemaConfig.object_store instead
# Hedging requests
hedging:
at: 250ms
max_per_second: 20
up_to: 3
# Query configuration
query_scheduler:
max_outstanding_requests_per_tenant: 2048
# Frontend configuration
frontend:
max_outstanding_per_tenant: 2048
# Single binary configuration
singleBinary:
replicas: 1
persistence:
enabled: true
storageClass: ceph-block
size: 10Gi
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
memory: 1Gi
extraEnv:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: loki-objstore-secret
key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: loki-objstore-secret
key: AWS_SECRET_ACCESS_KEY
# Gateway
gateway:
enabled: true
replicas: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Monitoring
monitoring:
selfMonitoring:
enabled: true
grafanaAgent:
installOperator: false
serviceMonitor:
enabled: true
# Service configuration
service:
type: ClusterIP
# S3 Bucket and credentials provisioning
extraObjects:
# ObjectBucketClaim for Loki logs
- apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: loki-logs
namespace: logging
spec:
bucketName: loki-logs
storageClassName: ceph-bucket
additionalConfig:
maxSize: "200Gi"
# Secret with S3 credentials. Rook writes the OBC credentials to a Secret named after
# the claim (loki-logs); copy those values here or point extraEnv at that Secret instead.
- apiVersion: v1
kind: Secret
metadata:
name: loki-objstore-secret
namespace: logging
type: Opaque
stringData:
AWS_ACCESS_KEY_ID: placeholder
AWS_SECRET_ACCESS_KEY: placeholder

View File

@ -0,0 +1,11 @@
apiVersion: v2
name: promtail
description: Promtail log collection agent wrapper chart
type: application
version: 1.0.0
appVersion: "3.3.2"
dependencies:
- name: promtail
version: 6.17.1
repository: https://grafana.github.io/helm-charts

View File

@ -0,0 +1,29 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: promtail
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "3"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/veda.git
targetRevision: applicationset-rewrite
path: apps/logging/promtail
helm:
releaseName: promtail
valueFiles:
- values.yaml
destination:
server: https://kubernetes.default.svc
namespace: logging
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- ServerSideApply=true

View File

@ -0,0 +1,163 @@
promtail:
# DaemonSet configuration
daemonset:
enabled: true
# Resources
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Configuration
config:
# Loki endpoint
clients:
- url: http://loki-gateway.logging.svc.cluster.local/loki/api/v1/push
tenant_id: ""
batchwait: 1s
batchsize: 1048576
timeout: 10s
# Positions file (persisted)
positions:
filename: /run/promtail/positions.yaml
# Server config
server:
log_level: info
http_listen_port: 3101
# Scrape configs
scrape_configs:
# Kubernetes pods
- job_name: kubernetes-pods
pipeline_stages:
# Extract log level
- regex:
expression: '(?i)(?P<level>trace|debug|info|warn|warning|error|err|fatal|critical|panic)'
# Parse JSON logs
- json:
expressions:
level: level
timestamp: timestamp
message: message
# Drop high-cardinality labels
- labeldrop:
- pod_uid
- container_id
- image_id
- stream
# Add log level as label (only keep certain levels)
- labels:
level:
kubernetes_sd_configs:
- role: pod
relabel_configs:
# Only scrape running pods
- source_labels: [__meta_kubernetes_pod_phase]
action: keep
regex: Running
# Keep essential labels
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
- source_labels: [__meta_kubernetes_pod_label_app]
target_label: app
- source_labels: [__meta_kubernetes_pod_container_name]
target_label: container
- source_labels: [__meta_kubernetes_pod_node_name]
target_label: node
# Add cluster label
- replacement: homelab
target_label: cluster
# Drop pods in kube-system namespace (optional)
# - source_labels: [__meta_kubernetes_namespace]
# action: drop
# regex: kube-system
# Container log path
- source_labels: [__meta_kubernetes_pod_uid, __meta_kubernetes_pod_container_name]
target_label: __path__
separator: /
replacement: /var/log/pods/*$1/*.log
# Journald logs (systemd)
- job_name: systemd-journal
journal:
path: /var/log/journal
max_age: 12h
labels:
job: systemd-journal
cluster: homelab
pipeline_stages:
# Parse priority to log level
- match:
selector: '{job="systemd-journal"}'
stages:
- template:
source: level
template: '{{ if eq .PRIORITY "0" }}fatal{{ else if eq .PRIORITY "1" }}alert{{ else if eq .PRIORITY "2" }}crit{{ else if eq .PRIORITY "3" }}error{{ else if eq .PRIORITY "4" }}warning{{ else if eq .PRIORITY "5" }}notice{{ else if eq .PRIORITY "6" }}info{{ else }}debug{{ end }}'
- labels:
level:
relabel_configs:
- source_labels: [__journal__systemd_unit]
target_label: unit
- source_labels: [__journal__hostname]
target_label: node
- source_labels: [__journal_syslog_identifier]
target_label: syslog_identifier
# Volumes
extraVolumes:
- name: journal
hostPath:
path: /var/log/journal
- name: positions
hostPath:
path: /var/lib/promtail/positions
type: DirectoryOrCreate
extraVolumeMounts:
- name: journal
mountPath: /var/log/journal
readOnly: true
- name: positions
mountPath: /run/promtail
# Tolerations to run on all nodes
tolerations:
- effect: NoSchedule
operator: Exists
# Service Monitor
serviceMonitor:
enabled: true
# Update strategy
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1

View File

@ -0,0 +1,11 @@
apiVersion: v2
name: grafana
description: Grafana visualization platform wrapper chart
type: application
version: 1.0.0
appVersion: "12.2.1"
dependencies:
- name: grafana
version: 10.1.4
repository: https://grafana.github.io/helm-charts

View File

@ -0,0 +1,38 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: grafana
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "2"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/veda.git
targetRevision: applicationset-rewrite
path: apps/monitoring/grafana
helm:
releaseName: grafana
valueFiles:
- values.yaml
destination:
server: https://kubernetes.default.svc
namespace: monitoring
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- ServerSideApply=true
ignoreDifferences:
- group: gateway.networking.k8s.io
kind: HTTPRoute
jsonPointers:
- /spec/parentRefs/0/group
- /spec/parentRefs/0/kind
- /spec/rules/0/backendRefs/0/group
- /spec/rules/0/backendRefs/0/kind
- /spec/rules/0/backendRefs/0/weight

View File

@ -0,0 +1,4 @@
{{- range .Values.extraObjects }}
---
{{ toYaml . }}
{{- end }}

View File

@ -0,0 +1,177 @@
grafana:
# Admin credentials
adminUser: admin
adminPassword: changeme # TODO: Use secret management
# Persistence
persistence:
enabled: true
storageClassName: ceph-block
size: 10Gi
# Resources
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
memory: 512Mi
# Datasources
datasources:
datasources.yaml:
apiVersion: 1
datasources:
# Thanos datasource
- name: Thanos
type: prometheus
access: proxy
url: http://thanos-query-frontend.monitoring.svc.cluster.local:9090
isDefault: true
editable: false
jsonData:
timeInterval: 30s
queryTimeout: 60s
# Loki datasource
- name: Loki
type: loki
access: proxy
url: http://loki-gateway.logging.svc.cluster.local
editable: false
jsonData:
maxLines: 1000
derivedFields:
- datasourceUid: Thanos
matcherRegex: "traceID=(\\w+)"
name: TraceID
url: "$${__value.raw}"
# Dashboard providers
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/default
- name: 'kubernetes'
orgId: 1
folder: 'Kubernetes'
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/kubernetes
# Preload dashboards
dashboards:
default:
# Node exporter dashboard
node-exporter:
gnetId: 1860
revision: 37
datasource: Thanos
# Kubernetes cluster monitoring
k8s-cluster:
gnetId: 7249
revision: 1
datasource: Thanos
kubernetes:
# Kubernetes pods
k8s-pods:
gnetId: 6417
revision: 1
datasource: Thanos
# Loki logs dashboard
loki-logs:
gnetId: 13639
revision: 2
datasource: Loki
# Grafana config
grafana.ini:
server:
root_url: https://grafana.noxxos.nl
serve_from_sub_path: false
# Authentication - Authentik OIDC
auth.generic_oauth:
enabled: true
name: Authentik
client_id: grafana # TODO: Use secret
client_secret: changeme # TODO: Use secret management
scopes: openid profile email
auth_url: https://auth.noxxos.nl/application/o/authorize/
token_url: https://auth.noxxos.nl/application/o/token/
api_url: https://auth.noxxos.nl/application/o/userinfo/
role_attribute_path: contains(groups[*], 'Grafana Admins') && 'Admin' || contains(groups[*], 'Grafana Editors') && 'Editor' || 'Viewer'
allow_sign_up: true
analytics:
reporting_enabled: false
check_for_updates: false
log:
mode: console
level: info
users:
auto_assign_org: true
auto_assign_org_role: Viewer
# Service Monitor
serviceMonitor:
enabled: true
# Plugins
plugins:
- grafana-piechart-panel
- grafana-clock-panel
# Gateway API HTTPRoute
extraObjects:
# ReferenceGrant
- apiVersion: gateway.networking.k8s.io/v1beta1
kind: ReferenceGrant
metadata:
name: traefik-gateway-access
namespace: monitoring
spec:
from:
- group: gateway.networking.k8s.io
kind: HTTPRoute
namespace: monitoring
to:
- group: ""
kind: Service
# Grafana HTTPRoute
- apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: grafana
namespace: monitoring
spec:
parentRefs:
- name: traefik-gateway
namespace: traefik
sectionName: websecure
hostnames:
- "grafana.noxxos.nl"
rules:
- matches:
- path:
type: PathPrefix
value: /
backendRefs:
- name: grafana
port: 80

View File

@ -0,0 +1,11 @@
apiVersion: v2
name: prometheus
description: Prometheus monitoring stack with Thanos sidecar wrapper chart
type: application
version: 1.0.0
appVersion: "0.86.2"
dependencies:
- name: kube-prometheus-stack
version: 79.4.1
repository: oci://ghcr.io/prometheus-community/charts

View File

@ -0,0 +1,30 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: prometheus
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "2"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/veda.git
targetRevision: applicationset-rewrite
path: apps/monitoring/prometheus
helm:
releaseName: prometheus
valueFiles:
- values.yaml
destination:
server: https://kubernetes.default.svc
namespace: monitoring
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- ServerSideApply=true
- SkipDryRunOnMissingResource=true

View File

@ -0,0 +1,138 @@
kube-prometheus-stack:
# Prometheus Operator
prometheusOperator:
enabled: true
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Prometheus configuration
prometheus:
enabled: true
prometheusSpec:
# Retention
retention: 24h
retentionSize: 15GB
# Resources
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
memory: 2Gi
# Storage
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: ceph-block
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 20Gi
# Thanos sidecar configuration
thanos:
image: quay.io/thanos/thanos:v0.37.2
version: v0.37.2
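# thanos-objstore-secret is created by the Thanos app (apps/monitoring/thanos
# extraObjects, sync-wave 1), which syncs before this application (sync-wave 2)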
objectStorageConfig:
name: thanos-objstore-secret
key: objstore.yml
# External labels for Thanos
externalLabels:
cluster: homelab
prometheus: monitoring/prometheus
# Replicas
replicas: 1
replicaExternalLabelName: prometheus_replica
# Service monitors
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
ruleSelectorNilUsesHelmValues: false
# Additional scrape configs
additionalScrapeConfigs: []
# Alertmanager
alertmanager:
enabled: true
alertmanagerSpec:
replicas: 1
storage:
volumeClaimTemplate:
spec:
storageClassName: ceph-block
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 5Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Grafana (disabled - using separate Grafana deployment)
grafana:
enabled: false
# Node Exporter
nodeExporter:
enabled: true
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
memory: 128Mi
# Kube State Metrics
kubeStateMetrics:
enabled: true
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Default rules
defaultRules:
create: true
rules:
alertmanager: true
etcd: false
configReloaders: true
general: true
k8s: true
kubeApiserverAvailability: true
kubeApiserverBurnrate: true
kubeApiserverHistogram: true
kubeApiserverSlos: true
kubeControllerManager: true
kubelet: true
kubeProxy: true
kubePrometheusGeneral: true
kubePrometheusNodeRecording: true
kubernetesApps: true
kubernetesResources: true
kubernetesStorage: true
kubernetesSystem: true
kubeSchedulerAlerting: true
kubeSchedulerRecording: true
kubeStateMetrics: true
network: true
node: true
nodeExporterAlerting: true
nodeExporterRecording: true
prometheus: true
prometheusOperator: true

View File

@ -0,0 +1,11 @@
apiVersion: v2
name: thanos
description: Thanos distributed metrics wrapper chart
type: application
version: 1.0.0
appVersion: "0.40.1"
dependencies:
- name: thanos
version: 1.22.0
repository: oci://ghcr.io/stevehipwell/helm-charts

View File

@ -0,0 +1,30 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: thanos
namespace: argocd
annotations:
argocd.argoproj.io/sync-wave: "1"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://git.mvzijl.nl/marco/veda.git
targetRevision: applicationset-rewrite
path: apps/monitoring/thanos
helm:
releaseName: thanos
valueFiles:
- values.yaml
destination:
server: https://kubernetes.default.svc
namespace: monitoring
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true
- SkipDryRunOnMissingResource=true

View File

@ -0,0 +1,4 @@
{{- range .Values.extraObjects }}
---
{{ toYaml . }}
{{- end }}

View File

@ -0,0 +1,130 @@
thanos:
# Object storage configuration
objstoreConfig:
create: false # We create the secret via extraObjects
name: thanos-objstore-secret
key: objstore.yml
# Image configuration
image:
registry: quay.io
repository: thanos/thanos
tag: v0.40.1
# Query component
query:
enabled: true
replicaCount: 2
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
memory: 512Mi
stores:
- dnssrv+_grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local
- dnssrv+_grpc._tcp.thanos-receive.monitoring.svc.cluster.local
# Query Frontend
queryFrontend:
enabled: true
replicaCount: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
memory: 256Mi
# Store Gateway
storegateway:
enabled: true
replicaCount: 1
persistence:
enabled: true
storageClass: ceph-block
size: 10Gi
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
memory: 1Gi
# Compactor
compactor:
enabled: true
persistence:
enabled: true
storageClass: ceph-block
size: 10Gi
retentionResolutionRaw: 14d
retentionResolution5m: 90d
retentionResolution1h: 2y
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
memory: 1Gi
extraFlags:
- --deduplication.replica-label=prometheus_replica
- --deduplication.replica-label=replica
- --downsampling.disable=false
- --compact.enable-vertical-compaction
# Receive (for remote write from Prometheus)
receive:
enabled: true
replicaCount: 1
persistence:
enabled: true
storageClass: ceph-block
size: 20Gi
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
memory: 1Gi
# Metrics and caching
# Note: Memcached configuration would be added here if using external caching
# Metrics
metrics:
enabled: true
serviceMonitor:
enabled: true
# S3 Bucket and credentials provisioning
extraObjects:
# ObjectBucketClaim for Thanos metrics
- apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: thanos-metrics
namespace: monitoring
spec:
bucketName: thanos-metrics
storageClassName: ceph-bucket
additionalConfig:
maxSize: "500Gi"
# Secret with the Thanos object storage config (placeholder credentials).
# Rook writes the OBC credentials to a Secret named after the claim (thanos-metrics);
# substitute those values into access_key/secret_key below.
- apiVersion: v1
kind: Secret
metadata:
name: thanos-objstore-secret
namespace: monitoring
type: Opaque
stringData:
objstore.yml: |-
type: S3
config:
bucket: thanos-metrics
endpoint: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80
insecure: true
access_key: ${AWS_ACCESS_KEY_ID}
secret_key: ${AWS_SECRET_ACCESS_KEY}

View File

apps/validate-manifests.sh Executable file (209 lines)
View File

@ -0,0 +1,209 @@
#!/bin/bash
# Kubernetes/Helm Configuration Validator
# Validates all applications without deploying them
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Counters
TOTAL=0
PASSED=0
FAILED=0
# Temp dirs created while building chart dependencies (cleaned up on exit)
TEMP_DIRS=()
echo -e "${BLUE}=== Kubernetes Configuration Validator ===${NC}\n"
# Function to validate a Helm chart
validate_helm_chart() {
local app_path=$1
local app_name=$(basename "$app_path")
local namespace=$2
TOTAL=$((TOTAL + 1))
echo -e "${YELLOW}[$TOTAL] Validating: $app_name (namespace: $namespace)${NC}"
# Check if Chart.yaml exists
if [ ! -f "$app_path/Chart.yaml" ]; then
echo -e "${RED} ✗ No Chart.yaml found${NC}\n"
FAILED=$((FAILED + 1))
return 1
fi
# Check if dependencies are built (build to temp location if not)
if [ -f "$app_path/Chart.yaml" ] && grep -q "dependencies:" "$app_path/Chart.yaml"; then
if [ ! -d "$app_path/charts" ]; then
echo " → Dependencies not built - building to temporary location..."
# Create temp directory and track it; setting a per-directory trap here would
# overwrite the previous trap on each call and leak earlier temp dirs
local temp_dir=$(mktemp -d)
TEMP_DIRS+=("$temp_dir")
trap 'rm -rf "${TEMP_DIRS[@]}"' EXIT
# Copy chart to temp location
cp -r "$app_path" "$temp_dir/"
local temp_chart="$temp_dir/$(basename "$app_path")"
# Build dependencies in temp location
if ! (cd "$temp_chart" && helm dependency build > /dev/null 2>&1); then
echo -e "${RED} ✗ Failed to build dependencies${NC}\n"
FAILED=$((FAILED + 1))
return 1
fi
# Use temp location for validation
app_path="$temp_chart"
fi
fi
# Lint the chart
echo " → Running Helm lint..."
if ! (cd "$app_path" && helm lint . 2>&1 | grep -q "0 chart(s) failed"); then
echo -e "${RED} ✗ Helm lint failed${NC}"
(cd "$app_path" && helm lint .)
echo ""
FAILED=$((FAILED + 1))
return 1
fi
# Template the chart
echo " → Rendering Helm templates..."
if ! (cd "$app_path" && helm template "$app_name" . --namespace "$namespace" --validate > /dev/null 2>&1); then
echo -e "${RED} ✗ Helm template failed${NC}"
(cd "$app_path" && helm template "$app_name" . --namespace "$namespace" --validate 2>&1 | head -20)
echo ""
FAILED=$((FAILED + 1))
return 1
fi
# Validate with kubeval (if installed)
if command -v kubeval &> /dev/null; then
echo " → Validating manifests with kubeval..."
if ! (cd "$app_path" && helm template "$app_name" . --namespace "$namespace" | kubeval --ignore-missing-schemas > /dev/null 2>&1); then
echo -e "${YELLOW} ⚠ Kubeval warnings (may be acceptable)${NC}"
fi
fi
# Check for common issues
echo " → Checking for common issues..."
local rendered=$(cd "$app_path" && helm template "$app_name" . --namespace "$namespace" 2>&1)
# Check for placeholder secrets
if echo "$rendered" | grep -qi "changeme\|placeholder\|CHANGE_ME\|TODO"; then
echo -e "${YELLOW} ⚠ Warning: Found placeholder values (changeme/placeholder/TODO)${NC}"
fi
# Check for resource requests/limits
if ! echo "$rendered" | grep -q "resources:"; then
echo -e "${YELLOW} ⚠ Warning: No resource requests/limits found${NC}"
fi
echo -e "${GREEN} ✓ Validation passed${NC}\n"
PASSED=$((PASSED + 1))
return 0
}
# Function to validate an ArgoCD Application manifest
validate_argocd_app() {
local app_file=$1
local app_name=$(basename "$(dirname "$app_file")")
TOTAL=$((TOTAL + 1))
echo -e "${YELLOW}[$TOTAL] Validating ArgoCD Application: $app_name${NC}"
# Check YAML syntax
if ! python3 -c "import yaml; yaml.safe_load(open('$app_file'))" 2>/dev/null; then
echo -e "${RED} ✗ Invalid YAML syntax${NC}\n"
FAILED=$((FAILED + 1))
return 1
fi
# Check for required fields
local missing_fields=()
grep -q "kind: Application" "$app_file" || missing_fields+=("kind: Application")
grep -q "metadata:" "$app_file" || missing_fields+=("metadata")
grep -q "spec:" "$app_file" || missing_fields+=("spec")
grep -q "source:" "$app_file" || missing_fields+=("source")
grep -q "destination:" "$app_file" || missing_fields+=("destination")
if [ ${#missing_fields[@]} -gt 0 ]; then
echo -e "${RED} ✗ Missing required fields: ${missing_fields[*]}${NC}\n"
FAILED=$((FAILED + 1))
return 1
fi
echo -e "${GREEN} ✓ Validation passed${NC}\n"
PASSED=$((PASSED + 1))
return 0
}
# Main validation flow
echo -e "${BLUE}Validating Monitoring Stack...${NC}\n"
# Thanos
if [ -d "apps/monitoring/thanos" ]; then
validate_helm_chart "apps/monitoring/thanos" "monitoring"
validate_argocd_app "apps/monitoring/thanos/application.yaml"
fi
# Prometheus
if [ -d "apps/monitoring/prometheus" ]; then
validate_helm_chart "apps/monitoring/prometheus" "monitoring"
validate_argocd_app "apps/monitoring/prometheus/application.yaml"
fi
# Grafana
if [ -d "apps/monitoring/grafana" ]; then
validate_helm_chart "apps/monitoring/grafana" "monitoring"
validate_argocd_app "apps/monitoring/grafana/application.yaml"
fi
echo -e "${BLUE}Validating Logging Stack...${NC}\n"
# Loki
if [ -d "apps/logging/loki" ]; then
validate_helm_chart "apps/logging/loki" "logging"
validate_argocd_app "apps/logging/loki/application.yaml"
fi
# Promtail
if [ -d "apps/logging/promtail" ]; then
validate_helm_chart "apps/logging/promtail" "logging"
validate_argocd_app "apps/logging/promtail/application.yaml"
fi
# Additional apps (if they exist)
echo -e "${BLUE}Validating Other Applications...${NC}\n"
for app_dir in apps/*/; do
app_name=$(basename "$app_dir")
if [ -f "$app_dir/Chart.yaml" ] && [ -f "$app_dir/application.yaml" ]; then
# Skip if already validated
if [[ "$app_name" != "monitoring" ]] && [[ "$app_name" != "logging" ]]; then
# Try to extract namespace from application.yaml
namespace=$(grep -A 10 "destination:" "$app_dir/application.yaml" | grep "namespace:" | head -1 | awk '{print $2}')
[ -z "$namespace" ] && namespace="default"
validate_helm_chart "$app_dir" "$namespace"
validate_argocd_app "$app_dir/application.yaml"
fi
fi
done
# Summary
echo -e "${BLUE}=== Validation Summary ===${NC}"
echo -e "Total checks: $TOTAL"
echo -e "${GREEN}Passed: $PASSED${NC}"
echo -e "${RED}Failed: $FAILED${NC}\n"
if [ $FAILED -eq 0 ]; then
echo -e "${GREEN}✓ All validations passed!${NC}"
exit 0
else
echo -e "${RED}✗ Some validations failed. Please review the errors above.${NC}"
exit 1
fi