From cac51f441606af0ba5907889d6c9b028d960f7b4 Mon Sep 17 00:00:00 2001 From: Marco van Zijl Date: Sun, 9 Nov 2025 17:12:33 +0100 Subject: [PATCH] Added monitoring and logging stack --- apps/argocd/templates/httproute.yaml | 0 .../ceph/cluster/templates/extra-objects.yaml | 0 apps/cert-manager/templates/certificates.yaml | 0 .../templates/cluster-issuers.yaml | 0 .../cert-manager/templates/extra-objects.yaml | 0 apps/cloudnative-pg-plugin/Chart.yaml | 0 apps/cloudnative-pg-plugin/MIRROR.md | 132 ++++++++ apps/cloudnative-pg-plugin/README.md | 301 ++++++++++++++++++ apps/cloudnative-pg-plugin/application.yaml | 32 ++ apps/cloudnative-pg/CONFIGURATION.md | 0 apps/cloudnative-pg/README.md | 128 +------- .../templates/extra-objects.yaml | 0 apps/cloudnative-pg/values.yaml | 27 +- apps/harbor/README.md | 0 apps/harbor/templates/extra-objects.yaml | 0 apps/logging/loki/Chart.yaml | 11 + apps/logging/loki/application.yaml | 30 ++ .../logging/loki/templates/extra-objects.yaml | 4 + apps/logging/loki/values.yaml | 152 +++++++++ apps/logging/promtail/Chart.yaml | 11 + apps/logging/promtail/application.yaml | 29 ++ apps/logging/promtail/values.yaml | 163 ++++++++++ apps/monitoring/grafana/Chart.yaml | 11 + apps/monitoring/grafana/application.yaml | 38 +++ .../grafana/templates/extra-objects.yaml | 4 + apps/monitoring/grafana/values.yaml | 177 ++++++++++ apps/monitoring/prometheus/Chart.yaml | 11 + apps/monitoring/prometheus/application.yaml | 30 ++ apps/monitoring/prometheus/values.yaml | 138 ++++++++ apps/monitoring/thanos/Chart.yaml | 11 + apps/monitoring/thanos/application.yaml | 30 ++ .../thanos/templates/extra-objects.yaml | 4 + apps/monitoring/thanos/values.yaml | 130 ++++++++ .../post-install/dashboard-httproute.yaml | 0 apps/traefik/post-install/gateway.yaml | 0 .../traefik/post-install/reference-grant.yaml | 0 .../templates/dashboard-httproute.yaml | 0 apps/traefik/templates/gateway.yaml | 0 apps/traefik/templates/reference-grant.yaml | 0 apps/validate-manifests.sh | 209 ++++++++++++ 40 files changed, 1665 insertions(+), 148 deletions(-) create mode 100644 apps/argocd/templates/httproute.yaml create mode 100644 apps/ceph/cluster/templates/extra-objects.yaml create mode 100644 apps/cert-manager/templates/certificates.yaml create mode 100644 apps/cert-manager/templates/cluster-issuers.yaml create mode 100644 apps/cert-manager/templates/extra-objects.yaml create mode 100644 apps/cloudnative-pg-plugin/Chart.yaml create mode 100644 apps/cloudnative-pg-plugin/MIRROR.md create mode 100644 apps/cloudnative-pg-plugin/README.md create mode 100644 apps/cloudnative-pg-plugin/application.yaml create mode 100644 apps/cloudnative-pg/CONFIGURATION.md create mode 100644 apps/cloudnative-pg/templates/extra-objects.yaml create mode 100644 apps/harbor/README.md create mode 100644 apps/harbor/templates/extra-objects.yaml create mode 100644 apps/logging/loki/Chart.yaml create mode 100644 apps/logging/loki/application.yaml create mode 100644 apps/logging/loki/templates/extra-objects.yaml create mode 100644 apps/logging/loki/values.yaml create mode 100644 apps/logging/promtail/Chart.yaml create mode 100644 apps/logging/promtail/application.yaml create mode 100644 apps/logging/promtail/values.yaml create mode 100644 apps/monitoring/grafana/Chart.yaml create mode 100644 apps/monitoring/grafana/application.yaml create mode 100644 apps/monitoring/grafana/templates/extra-objects.yaml create mode 100644 apps/monitoring/grafana/values.yaml create mode 100644 apps/monitoring/prometheus/Chart.yaml create mode 100644
apps/monitoring/prometheus/application.yaml create mode 100644 apps/monitoring/prometheus/values.yaml create mode 100644 apps/monitoring/thanos/Chart.yaml create mode 100644 apps/monitoring/thanos/application.yaml create mode 100644 apps/monitoring/thanos/templates/extra-objects.yaml create mode 100644 apps/monitoring/thanos/values.yaml create mode 100644 apps/traefik/post-install/dashboard-httproute.yaml create mode 100644 apps/traefik/post-install/gateway.yaml create mode 100644 apps/traefik/post-install/reference-grant.yaml create mode 100644 apps/traefik/templates/dashboard-httproute.yaml create mode 100644 apps/traefik/templates/gateway.yaml create mode 100644 apps/traefik/templates/reference-grant.yaml create mode 100755 apps/validate-manifests.sh diff --git a/apps/argocd/templates/httproute.yaml b/apps/argocd/templates/httproute.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/ceph/cluster/templates/extra-objects.yaml b/apps/ceph/cluster/templates/extra-objects.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cert-manager/templates/certificates.yaml b/apps/cert-manager/templates/certificates.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cert-manager/templates/cluster-issuers.yaml b/apps/cert-manager/templates/cluster-issuers.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cert-manager/templates/extra-objects.yaml b/apps/cert-manager/templates/extra-objects.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cloudnative-pg-plugin/Chart.yaml b/apps/cloudnative-pg-plugin/Chart.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cloudnative-pg-plugin/MIRROR.md b/apps/cloudnative-pg-plugin/MIRROR.md new file mode 100644 index 0000000..58ba19d --- /dev/null +++ b/apps/cloudnative-pg-plugin/MIRROR.md @@ -0,0 +1,132 @@ +# Mirroring CloudNativePG Barman Plugin + +## Setup Mirror Repository + +1. **Clone the upstream repository:** + ```bash + cd /tmp + git clone --mirror https://github.com/cloudnative-pg/plugin-barman-cloud.git + cd plugin-barman-cloud.git + ``` + +2. **Push to your Git server:** + ```bash + # Create repo on your Git server first (git.mvzijl.nl) + # Then push: + git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git + ``` + +3. **Set up periodic sync (optional):** + ```bash + # Create a script to sync weekly (path assumes a server-side mirror clone) + cat > /usr/local/bin/sync-barman-plugin.sh <<'EOF' + #!/bin/bash + cd /var/git/mirrors/plugin-barman-cloud.git + git fetch --prune origin + git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git + EOF + + chmod +x /usr/local/bin/sync-barman-plugin.sh + + # Add to cron (weekly on Sunday at 2 AM) without clobbering existing entries + (crontab -l 2>/dev/null; echo "0 2 * * 0 /usr/local/bin/sync-barman-plugin.sh") | crontab - + ``` + +## Update Application Reference + +After mirroring, update the application.yaml to use your mirror: + +```yaml +spec: + source: + repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git + targetRevision: main # or specific tag like v1.0.0 + path: deployments/manifests +``` + +## Version Pinning Strategy + +Instead of tracking `main`, pin to specific releases: + +```yaml +spec: + source: + repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git + targetRevision: v1.0.0 # Pin to specific version + path: deployments/manifests +``` + +This gives you: +- ✅ Predictable deployments +- ✅ Controlled updates +- ✅ Rollback capability + +## Update Process + +When a new version is released: + +1. 
**Check upstream for updates:** + ```bash + cd /var/git/mirrors/plugin-barman-cloud.git + git fetch origin + git tag -l + ``` + +2. **Review changes:** + ```bash + git log HEAD..origin/main --oneline + git diff HEAD..origin/main deployments/manifests/ + ``` + +3. **Sync to your mirror:** + ```bash + git push --mirror https://git.mvzijl.nl/marco/plugin-barman-cloud.git + ``` + +4. **Update application.yaml:** + ```yaml + targetRevision: v1.1.0 # Update to new version + ``` + +5. **Test and deploy:** + ```bash + git add apps/cloudnative-pg-plugin/application.yaml + git commit -m "Update barman plugin to v1.1.0" + git push + ``` + +## Monitoring Upstream + +Subscribe to releases: +- GitHub: Watch → Custom → Releases only +- RSS: `https://github.com/cloudnative-pg/plugin-barman-cloud/releases.atom` + +## Alternative: Subtree Approach + +Instead of mirroring, you could use git subtree: + +```bash +cd /Users/marco/Documents/Hobby/Veda/talos +git subtree add --prefix vendor/plugin-barman-cloud \ + https://github.com/cloudnative-pg/plugin-barman-cloud.git main --squash + +# Then reference in application: +# path: vendor/plugin-barman-cloud/deployments/manifests +``` + +Update when needed: +```bash +git subtree pull --prefix vendor/plugin-barman-cloud \ + https://github.com/cloudnative-pg/plugin-barman-cloud.git main --squash +``` + +## Recommended Approach + +For your setup, I recommend: + +1. **Mirror to your Git server** at `git.mvzijl.nl/marco/plugin-barman-cloud` +2. **Pin to specific versions** (not `main`) +3. **Review updates** before applying +4. **Set up monitoring** for new releases + +This gives you the best balance of control and maintainability. diff --git a/apps/cloudnative-pg-plugin/README.md b/apps/cloudnative-pg-plugin/README.md new file mode 100644 index 0000000..ad6e63f --- /dev/null +++ b/apps/cloudnative-pg-plugin/README.md @@ -0,0 +1,301 @@ +# CloudNativePG Barman-Cloud Plugin + +## Overview + +The Barman Cloud Plugin provides object storage backup capabilities for CloudNativePG using the Barman toolset. + +**Important**: As of CloudNativePG v1.26+, the native `barmanObjectStore` backup method is **deprecated**. You should use this plugin instead. + +## Why This Plugin is Required + +From the CloudNativePG 1.27 documentation: + +> Starting with version 1.26, native backup and recovery capabilities are being progressively phased out of the core operator and moved to official CNPG-I plugins. + +The built-in barman integration (`method: barmanObjectStore`) is deprecated and will be removed in future versions. This plugin provides the official replacement. + +## What This Plugin Provides + +- ✅ **WAL archiving** to S3-compatible object stores +- ✅ **Base backups** with compression and encryption +- ✅ **Point-in-time recovery (PITR)** +- ✅ **Retention policies** for automated cleanup +- ✅ **Backup from standby** servers +- ✅ **Support for multiple storage backends**: S3, Azure Blob, GCS, MinIO, Ceph S3 (RGW) + +## Installation + +This application deploys the plugin to the `cnpg-system` namespace where the CloudNativePG operator runs. + +The plugin will be available for all PostgreSQL clusters managed by CloudNativePG. 
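+The cluster examples below reference a `backup-credentials` Secret and an S3 bucket on the Ceph RGW. Here is a minimal provisioning sketch, assuming Rook's bucket provisioner watches this namespace; the object names are hypothetical but match the examples that follow, and the re-keying step is needed because Rook emits `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` while the cluster spec expects `ACCESS_KEY_ID`/`ACCESS_SECRET_KEY`:
+
+```bash
+# Claim a backup bucket via Rook's ObjectBucketClaim mechanism
+kubectl apply -f - <<'EOF'
+apiVersion: objectbucket.io/v1alpha1
+kind: ObjectBucketClaim
+metadata:
+  name: postgres-backups
+  namespace: cnpg-system
+spec:
+  bucketName: postgres-backups
+  storageClassName: ceph-bucket
+EOF
+
+# Rook creates a Secret named after the OBC; copy its credentials into the
+# key names referenced by the Cluster examples below
+kubectl create secret generic backup-credentials -n cnpg-system \
+  --from-literal=ACCESS_KEY_ID="$(kubectl get secret postgres-backups \
+      -n cnpg-system -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d)" \
+  --from-literal=ACCESS_SECRET_KEY="$(kubectl get secret postgres-backups \
+      -n cnpg-system -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d)"
+```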
+ +## Configuration in PostgreSQL Clusters + +### Using the Plugin (New Method) + +```yaml +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: my-cluster +spec: + backup: + target: prefer-standby + + # Use the plugin method (required for v1.26+) + method: plugin + + # Plugin configuration + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + + # S3 configuration + barmanObjectStore: + destinationPath: s3://postgres-backups/ + endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 + + # Credentials + s3Credentials: + accessKeyId: + name: backup-credentials + key: ACCESS_KEY_ID + secretAccessKey: + name: backup-credentials + key: ACCESS_SECRET_KEY + + # Compression and parallelism + data: + compression: bzip2 + jobs: 2 + immediateCheckpoint: true + + wal: + compression: bzip2 + maxParallel: 2 + + # Retention policy + retentionPolicy: "30d" + + # Tags for organization + tags: + environment: "production" + cluster: "my-cluster" +``` + +### Old Method (Deprecated) + +```yaml +# ❌ DON'T USE - This is deprecated +spec: + backup: + method: barmanObjectStore # Deprecated! + barmanObjectStore: + # ... config +``` + +## WAL Archiving + +The plugin also handles WAL archiving. Configure it at the cluster level: + +```yaml +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: my-cluster +spec: + backup: + # Backup configuration (as above) + ... + + # WAL archiving uses the same plugin configuration + # Automatically enabled when backup is configured +``` + +## Scheduled Backups + +Create scheduled backups using the plugin: + +```yaml +apiVersion: postgresql.cnpg.io/v1 +kind: ScheduledBackup +metadata: + name: daily-backup +spec: + schedule: "0 0 2 * * *" # 2 AM daily + backupOwnerReference: self + cluster: + name: my-cluster + + # Use plugin method + method: plugin + + # Plugin configuration (or inherits from cluster) + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io +``` + +## On-Demand Backups + +Trigger manual backups: + +```yaml +apiVersion: postgresql.cnpg.io/v1 +kind: Backup +metadata: + name: manual-backup +spec: + cluster: + name: my-cluster + + method: plugin + + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io +``` + +Or use kubectl: + +```bash +kubectl cnpg backup my-cluster --method plugin +``` + +## Retention Policies + +The plugin supports advanced retention policies: + +```yaml +pluginConfiguration: + barmanObjectStore: + retentionPolicy: "30d" # Keep backups for 30 days + # or + # retentionPolicy: "7 days" + # retentionPolicy: "4 weeks" + # retentionPolicy: "3 months" +``` + +## Supported Storage Backends + +### AWS S3 +```yaml +destinationPath: s3://bucket-name/ +# endpointURL not needed for AWS S3 +``` + +### Ceph S3 (RGW) - Your Setup +```yaml +destinationPath: s3://postgres-backups/ +endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 +``` + +### Azure Blob Storage +```yaml +destinationPath: https://storageaccount.blob.core.windows.net/container/ +``` + +### Google Cloud Storage +```yaml +destinationPath: gs://bucket-name/ +``` + +### MinIO +```yaml +destinationPath: s3://bucket-name/ +endpointURL: http://minio:9000 +``` + +## Verification + +After deploying, verify the plugin is running: + +```bash +# Check plugin deployment +kubectl get deployment -n cnpg-system | grep plugin + +# Check plugin pods +kubectl get pods -n cnpg-system -l app=barman-cloud-plugin + +# Verify plugin is registered +kubectl get configmap -n cnpg-system cnpg-plugin-registry -o yaml +``` + +## Troubleshooting + +### 
Plugin Not Found + +If you see errors like "plugin not found": + +```bash +# Check if plugin is deployed +kubectl get pods -n cnpg-system -l app=barman-cloud-plugin + +# Check operator logs +kubectl logs -n cnpg-system -l app.kubernetes.io/name=cloudnative-pg +``` + +### Backup Failures + +```bash +# Check backup status +kubectl get backup -n <namespace> + +# Check backup logs +kubectl describe backup <backup-name> -n <namespace> + +# Check PostgreSQL pod logs +kubectl logs <pod-name> -n <namespace> | grep -i backup +``` + +### WAL Archiving Issues + +```bash +# Check WAL archive status +kubectl exec -it <pod-name> -n <namespace> -- \ + psql -c "SELECT * FROM pg_stat_archiver;" + +# Check plugin logs +kubectl logs -n cnpg-system -l app=barman-cloud-plugin +``` + +## Migration from Built-in to Plugin + +If you're migrating from the deprecated `barmanObjectStore` method: + +1. **Deploy this plugin application** +2. **Update your Cluster resource**: + ```yaml + spec: + backup: + method: plugin # Change from barmanObjectStore + pluginConfiguration: + name: barman-cloud.cloudnative-pg.io + barmanObjectStore: + # Keep same configuration + ``` +3. **Existing backups remain accessible** - the plugin can read backups created by the built-in method + +## Best Practices + +1. ✅ **Always use the plugin** for CloudNativePG v1.26+ +2. ✅ **Configure retention policies** to manage storage costs +3. ✅ **Enable backup from standby** to reduce primary load +4. ✅ **Use compression** (bzip2) to reduce storage usage +5. ✅ **Set up scheduled backups** for automated protection +6. ✅ **Test recovery procedures** regularly +7. ✅ **Monitor backup status** with Prometheus metrics +8. ✅ **Tag backups** for easy identification and filtering + +## Next Steps + +1. Deploy this application: `git add . && git commit && git push` +2. Wait for ArgoCD to sync +3. Update your PostgreSQL Cluster to use `method: plugin` +4. Create an S3 bucket for backups (ObjectBucketClaim) +5. Configure backup credentials +6. 
Test with an on-demand backup + +## Additional Resources + +- [Barman Cloud Plugin Documentation](https://cloudnative-pg.io/plugin-barman-cloud/) +- [CloudNativePG Backup Guide](https://cloudnative-pg.io/documentation/1.27/backup/) +- [CNPG-I Plugin Architecture](https://cloudnative-pg.io/documentation/1.27/cnpg_i/) + diff --git a/apps/cloudnative-pg-plugin/application.yaml b/apps/cloudnative-pg-plugin/application.yaml new file mode 100644 index 0000000..7ca28bb --- /dev/null +++ b/apps/cloudnative-pg-plugin/application.yaml @@ -0,0 +1,32 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cloudnative-pg-plugin + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "0" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/plugin-barman-cloud.git + targetRevision: 0.9.0 + path: deployments/manifests + destination: + server: https://kubernetes.default.svc + namespace: cnpg-system + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + - ServerSideApply=true + # Ensure operator is healthy before deploying plugin + retry: + limit: 5 + backoff: + duration: 5s + factor: 2 + maxDuration: 3m diff --git a/apps/cloudnative-pg/CONFIGURATION.md b/apps/cloudnative-pg/CONFIGURATION.md new file mode 100644 index 0000000..e69de29 diff --git a/apps/cloudnative-pg/README.md b/apps/cloudnative-pg/README.md index 2736763..6380d09 100644 --- a/apps/cloudnative-pg/README.md +++ b/apps/cloudnative-pg/README.md @@ -47,98 +47,13 @@ CloudNativePG is a Kubernetes operator that manages PostgreSQL clusters using Ku ### Example Cluster (Commented Out) -The `values.yaml` includes a commented example cluster configuration with: -- **Storage**: `local-path` StorageClass (for development) -- **Backup**: Barman-cloud plugin with S3 (Ceph RGW) backend -- **Note**: See "Storage Considerations" section below - -## ⚠️ Storage Considerations - -### Local Path vs Ceph Block - -The example cluster uses `local-path` StorageClass, which is suitable for: -- ✅ **Development/Testing**: Quick setup, no Ceph dependency -- ✅ **Single-node scenarios**: When HA isn't required -- ✅ **Learning/Experimentation**: Testing PostgreSQL features - -**For production use, change to `ceph-block`:** - -```yaml -storage: - storageClass: ceph-block # Instead of local-path - size: 50Gi -``` - -### Why Ceph Block for Production? - -| Feature | local-path | ceph-block | -|---------|-----------|------------| -| **High Availability** | ❌ No | ✅ Yes | -| **Data Replication** | ❌ No | ✅ 2x copies | -| **Pod Mobility** | ❌ Pinned to node | ✅ Can move | -| **Snapshots** | ❌ No | ✅ Yes | -| **Auto Resize** | ❌ No | ✅ Yes | -| **Node Failure** | ❌ Data unavailable | ✅ Survives | - -### Hybrid Approach (Recommended for Dev) - -Even with local-path storage, the S3 backup provides safety: -- **Primary storage**: local-path (fast, simple) -- **Backups**: Ceph S3 (safe, replicated, off-node) -- **Recovery**: Restore from S3 if node fails - -This gives you: -- ✅ Point-in-time recovery -- ✅ Off-node backup storage -- ✅ Disaster recovery capability -- ✅ Fast local performance -- ⚠️ But no automatic HA - -## Barman-Cloud Backup Plugin - -CloudNativePG uses the modern barman-cloud toolset for backups. 
- -### Configuration Features: - -```yaml -backup: - barmanObjectStore: - # Parallel processing - data: - compression: bzip2 - jobs: 2 # Parallel compression threads - wal: - compression: bzip2 - maxParallel: 2 # Parallel WAL uploads - - # Metadata tags - tags: - environment: "development" - managed-by: "cloudnative-pg" - - # Backup lineage tracking - historyTags: - environment: "development" -``` - -### Plugin Benefits: -- ✅ **Better S3 compatibility**: Works with all S3-compatible stores -- ✅ **Improved parallelism**: Faster backups for large databases -- ✅ **Enhanced error handling**: Better retry logic -- ✅ **Cloud-native design**: Optimized for object storage -- ✅ **Metadata tagging**: Better backup organization - -### Backup Strategy: -1. **Continuous WAL archiving**: Real-time transaction logs to S3 -2. **Scheduled full backups**: Complete database snapshots -3. **Point-in-time recovery**: Restore to any timestamp -4. **Retention policies**: Automatic cleanup of old backups +The `values.yaml` includes a commented example cluster configuration. See "Creating Your First Cluster" below. ## Creating Your First Cluster -### Option 1: Using extraObjects in values.yaml (Development) +### Option 1: Using extraObjects in values.yaml -Uncomment the `extraObjects` section in `values.yaml` for a development cluster: +Uncomment the `extraObjects` section in `values.yaml` and customize: ```yaml extraObjects: @@ -149,36 +64,14 @@ extraObjects: namespace: cnpg-system spec: instances: 2 # 1 primary + 1 replica - - # Development: local-path for fast local storage storage: size: 50Gi - storageClass: local-path - - # Backup to Ceph S3 for safety - backup: - retentionPolicy: "30d" - barmanObjectStore: - destinationPath: s3://postgres-backups/ - endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 - s3Credentials: - accessKeyId: - name: postgres-backup-credentials - key: ACCESS_KEY_ID - secretAccessKey: - name: postgres-backup-credentials - key: ACCESS_SECRET_KEY - data: - compression: bzip2 - jobs: 2 - wal: - compression: bzip2 - maxParallel: 2 + storageClass: ceph-block ``` -### Option 2: Separate Application (Production) +### Option 2: Separate Application -For production, create a separate ArgoCD Application with ceph-block storage: +For production, create a separate ArgoCD Application for each database cluster: ```bash mkdir -p apps/databases/my-app-db @@ -199,7 +92,6 @@ spec: max_connections: "200" shared_buffers: "256MB" - # Production: ceph-block for HA storage: size: 100Gi storageClass: ceph-block @@ -207,7 +99,6 @@ spec: monitoring: enablePodMonitor: true - # Barman-cloud backup configuration backup: retentionPolicy: "30d" barmanObjectStore: @@ -222,13 +113,6 @@ spec: key: ACCESS_SECRET_KEY data: compression: bzip2 - jobs: 2 # Parallel compression - wal: - compression: bzip2 - maxParallel: 2 # Parallel WAL uploads - tags: - environment: "production" - application: "my-app" wal: compression: bzip2 ``` diff --git a/apps/cloudnative-pg/templates/extra-objects.yaml b/apps/cloudnative-pg/templates/extra-objects.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/cloudnative-pg/values.yaml b/apps/cloudnative-pg/values.yaml index d8bbde5..3470ce5 100644 --- a/apps/cloudnative-pg/values.yaml +++ b/apps/cloudnative-pg/values.yaml @@ -41,26 +41,14 @@ cloudnative-pg: # effective_io_concurrency: "300" # monitoring: # enablePodMonitor: true -# -# # Use local-path-provisioner for storage # storage: # size: 50Gi -# storageClass: local-path -# -# # Backup configuration using new 
plugin system +# storageClass: ceph-block # backup: # retentionPolicy: "30d" -# -# # Volume for barman backups (uses same StorageClass as main storage) -# volumeSnapshot: -# className: local-path -# -# # S3 backup using barman-cloud plugin # barmanObjectStore: # destinationPath: s3://postgres-backups/ # endpointURL: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 -# -# # S3 credentials reference # s3Credentials: # accessKeyId: # name: postgres-backup-credentials @@ -68,20 +56,7 @@ cloudnative-pg: # secretAccessKey: # name: postgres-backup-credentials # key: ACCESS_SECRET_KEY -# -# # Compression settings # data: # compression: bzip2 -# jobs: 2 # wal: # compression: bzip2 -# maxParallel: 2 -# -# # Tags for backup organization -# tags: -# environment: "development" -# managed-by: "cloudnative-pg" -# -# # Backup history and retention -# historyTags: -# environment: "development" diff --git a/apps/harbor/README.md b/apps/harbor/README.md new file mode 100644 index 0000000..e69de29 diff --git a/apps/harbor/templates/extra-objects.yaml b/apps/harbor/templates/extra-objects.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/logging/loki/Chart.yaml b/apps/logging/loki/Chart.yaml new file mode 100644 index 0000000..3cdec7e --- /dev/null +++ b/apps/logging/loki/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: loki +description: Grafana Loki logging stack wrapper chart +type: application +version: 1.0.0 +appVersion: "3.5.7" + +dependencies: + - name: loki + version: 6.45.2 + repository: https://grafana.github.io/helm-charts diff --git a/apps/logging/loki/application.yaml b/apps/logging/loki/application.yaml new file mode 100644 index 0000000..2d5f096 --- /dev/null +++ b/apps/logging/loki/application.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: loki + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "1" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/veda.git + targetRevision: applicationset-rewrite + path: apps/logging/loki + helm: + releaseName: loki + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: logging + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + - SkipDryRunOnMissingResource=true diff --git a/apps/logging/loki/templates/extra-objects.yaml b/apps/logging/loki/templates/extra-objects.yaml new file mode 100644 index 0000000..8dd36ec --- /dev/null +++ b/apps/logging/loki/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{- range .Values.extraObjects }} +--- +{{ toYaml . 
}} +{{- end }} diff --git a/apps/logging/loki/values.yaml b/apps/logging/loki/values.yaml new file mode 100644 index 0000000..d1f19fa --- /dev/null +++ b/apps/logging/loki/values.yaml @@ -0,0 +1,152 @@ +loki: + # Single binary deployment mode + deploymentMode: SingleBinary + + loki: + # Authentication + auth_enabled: false + + # Common configuration + commonConfig: + replication_factor: 1 + + # Storage configuration + schemaConfig: + configs: + - from: "2024-01-01" + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: loki_index_ + period: 24h + + # Storage backend configuration + storage: + type: s3 + bucketNames: + chunks: loki-logs + ruler: loki-logs + admin: loki-logs + s3: + endpoint: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 + region: us-east-1 + insecure: true + s3ForcePathStyle: true + accessKeyId: ${AWS_ACCESS_KEY_ID} + secretAccessKey: ${AWS_SECRET_ACCESS_KEY} + + # Limits and retention + limits_config: + retention_period: 90d + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 20 + max_query_series: 10000 + max_query_parallelism: 32 + reject_old_samples: true + reject_old_samples_max_age: 168h + + # Compactor configuration for retention + compactor: + working_directory: /var/loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + + # Storage config + storage_config: + tsdb_shipper: + active_index_directory: /var/loki/tsdb-index + cache_location: /var/loki/tsdb-cache + shared_store: s3 + + # Hedging requests + hedging: + at: 250ms + max_per_second: 20 + up_to: 3 + + # Query configuration + query_scheduler: + max_outstanding_requests_per_tenant: 2048 + + # Frontend configuration + frontend: + max_outstanding_per_tenant: 2048 + + # Single binary configuration + singleBinary: + replicas: 1 + persistence: + enabled: true + storageClass: ceph-block + size: 10Gi + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 1Gi + + extraEnv: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: loki-objstore-secret + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: loki-objstore-secret + key: AWS_SECRET_ACCESS_KEY + + # Gateway + gateway: + enabled: true + replicas: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Monitoring + monitoring: + selfMonitoring: + enabled: true + grafanaAgent: + installOperator: false + serviceMonitor: + enabled: true + + # Service configuration + service: + type: ClusterIP + +# S3 Bucket and credentials provisioning +extraObjects: + # ObjectBucketClaim for Loki logs + - apiVersion: objectbucket.io/v1alpha1 + kind: ObjectBucketClaim + metadata: + name: loki-logs + namespace: logging + spec: + bucketName: loki-logs + storageClassName: ceph-bucket + additionalConfig: + maxSize: "200Gi" + + # Secret with S3 credentials (populated by Rook from OBC) + - apiVersion: v1 + kind: Secret + metadata: + name: loki-objstore-secret + namespace: logging + type: Opaque + stringData: + AWS_ACCESS_KEY_ID: placeholder + AWS_SECRET_ACCESS_KEY: placeholder diff --git a/apps/logging/promtail/Chart.yaml b/apps/logging/promtail/Chart.yaml new file mode 100644 index 0000000..54cd9d8 --- /dev/null +++ b/apps/logging/promtail/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: promtail +description: Promtail log collection agent wrapper chart +type: application +version: 1.0.0 +appVersion: "3.3.2" + +dependencies: + - name: promtail + version: 6.17.1 + repository: 
https://grafana.github.io/helm-charts diff --git a/apps/logging/promtail/application.yaml b/apps/logging/promtail/application.yaml new file mode 100644 index 0000000..82c6ee7 --- /dev/null +++ b/apps/logging/promtail/application.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: promtail + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "3" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/veda.git + targetRevision: applicationset-rewrite + path: apps/logging/promtail + helm: + releaseName: promtail + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: logging + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + - ServerSideApply=true diff --git a/apps/logging/promtail/values.yaml b/apps/logging/promtail/values.yaml new file mode 100644 index 0000000..b8fd430 --- /dev/null +++ b/apps/logging/promtail/values.yaml @@ -0,0 +1,163 @@ +promtail: + # DaemonSet configuration + daemonset: + enabled: true + + # Resources + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Configuration + config: + # Loki endpoint + clients: + - url: http://loki-gateway.logging.svc.cluster.local/loki/api/v1/push + tenant_id: "" + batchwait: 1s + batchsize: 1048576 + timeout: 10s + + # Positions file (persisted) + positions: + filename: /run/promtail/positions.yaml + + # Server config + server: + log_level: info + http_listen_port: 3101 + + # Scrape configs + scrape_configs: + # Kubernetes pods + - job_name: kubernetes-pods + pipeline_stages: + # Extract log level into a named capture group + - regex: + expression: '(?i)(?P<level>trace|debug|info|warn|warning|error|err|fatal|critical|panic)' + + # Parse JSON logs + - json: + expressions: + level: level + timestamp: timestamp + message: message + + # Drop high-cardinality labels + - labeldrop: + - pod_uid + - container_id + - image_id + - stream + + # Add log level as label (only keep certain levels) + - labels: + level: + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + # Only scrape running pods + - source_labels: [__meta_kubernetes_pod_phase] + action: keep + regex: Running + + # Keep essential labels + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: app + + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: node + + # Add cluster label + - replacement: homelab + target_label: cluster + + # Drop pods in kube-system namespace (optional) + # - source_labels: [__meta_kubernetes_namespace] + # action: drop + # regex: kube-system + + # Container log path + - source_labels: [__meta_kubernetes_pod_uid, __meta_kubernetes_pod_container_name] + target_label: __path__ + separator: / + replacement: /var/log/pods/*$1/*.log + + # Journald logs (systemd) + - job_name: systemd-journal + journal: + path: /var/log/journal + max_age: 12h + labels: + job: systemd-journal + cluster: homelab + + pipeline_stages: + # Parse priority to log level + - match: + selector: '{job="systemd-journal"}' + stages: + - template: + source: level + template: '{{ if eq .PRIORITY "0" }}fatal{{ else if eq .PRIORITY "1" }}alert{{ else if eq .PRIORITY "2" }}crit{{ else if eq .PRIORITY "3" }}error{{ else 
if eq .PRIORITY "4" }}warning{{ else if eq .PRIORITY "5" }}notice{{ else if eq .PRIORITY "6" }}info{{ else }}debug{{ end }}' + + - labels: + level: + + relabel_configs: + - source_labels: [__journal__systemd_unit] + target_label: unit + + - source_labels: [__journal__hostname] + target_label: node + + - source_labels: [__journal_syslog_identifier] + target_label: syslog_identifier + + # Volumes + extraVolumes: + - name: journal + hostPath: + path: /var/log/journal + + - name: positions + hostPath: + path: /var/lib/promtail/positions + type: DirectoryOrCreate + + extraVolumeMounts: + - name: journal + mountPath: /var/log/journal + readOnly: true + + - name: positions + mountPath: /run/promtail + + # Tolerations to run on all nodes + tolerations: + - effect: NoSchedule + operator: Exists + + # Service Monitor + serviceMonitor: + enabled: true + + # Update strategy + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 diff --git a/apps/monitoring/grafana/Chart.yaml b/apps/monitoring/grafana/Chart.yaml new file mode 100644 index 0000000..1c252fa --- /dev/null +++ b/apps/monitoring/grafana/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: grafana +description: Grafana visualization platform wrapper chart +type: application +version: 1.0.0 +appVersion: "12.2.1" + +dependencies: + - name: grafana + version: 10.1.4 + repository: https://grafana.github.io/helm-charts diff --git a/apps/monitoring/grafana/application.yaml b/apps/monitoring/grafana/application.yaml new file mode 100644 index 0000000..e738188 --- /dev/null +++ b/apps/monitoring/grafana/application.yaml @@ -0,0 +1,38 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: grafana + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "2" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/veda.git + targetRevision: applicationset-rewrite + path: apps/monitoring/grafana + helm: + releaseName: grafana + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + - ServerSideApply=true + ignoreDifferences: + - group: gateway.networking.k8s.io + kind: HTTPRoute + jsonPointers: + - /spec/parentRefs/0/group + - /spec/parentRefs/0/kind + - /spec/rules/0/backendRefs/0/group + - /spec/rules/0/backendRefs/0/kind + - /spec/rules/0/backendRefs/0/weight diff --git a/apps/monitoring/grafana/templates/extra-objects.yaml b/apps/monitoring/grafana/templates/extra-objects.yaml new file mode 100644 index 0000000..8dd36ec --- /dev/null +++ b/apps/monitoring/grafana/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{- range .Values.extraObjects }} +--- +{{ toYaml . 
}} +{{- end }} diff --git a/apps/monitoring/grafana/values.yaml b/apps/monitoring/grafana/values.yaml new file mode 100644 index 0000000..77573c7 --- /dev/null +++ b/apps/monitoring/grafana/values.yaml @@ -0,0 +1,177 @@ +grafana: + # Admin credentials + adminUser: admin + adminPassword: changeme # TODO: Use secret management + + # Persistence + persistence: + enabled: true + storageClassName: ceph-block + size: 10Gi + + # Resources + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + + # Datasources + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + # Thanos datasource + - name: Thanos + type: prometheus + access: proxy + url: http://thanos-query-frontend.monitoring.svc.cluster.local:9090 + isDefault: true + editable: false + jsonData: + timeInterval: 30s + queryTimeout: 60s + + # Loki datasource + - name: Loki + type: loki + access: proxy + url: http://loki-gateway.logging.svc.cluster.local + editable: false + jsonData: + maxLines: 1000 + derivedFields: + - datasourceUid: Thanos + matcherRegex: "traceID=(\\w+)" + name: TraceID + url: "$${__value.raw}" + + # Dashboard providers + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + - name: 'kubernetes' + orgId: 1 + folder: 'Kubernetes' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/kubernetes + + # Preload dashboards + dashboards: + default: + # Node exporter dashboard + node-exporter: + gnetId: 1860 + revision: 37 + datasource: Thanos + + # Kubernetes cluster monitoring + k8s-cluster: + gnetId: 7249 + revision: 1 + datasource: Thanos + + kubernetes: + # Kubernetes pods + k8s-pods: + gnetId: 6417 + revision: 1 + datasource: Thanos + + # Loki logs dashboard + loki-logs: + gnetId: 13639 + revision: 2 + datasource: Loki + + # Grafana config + grafana.ini: + server: + root_url: https://grafana.noxxos.nl + serve_from_sub_path: false + + # Authentication - Authentik OIDC + auth.generic_oauth: + enabled: true + name: Authentik + client_id: grafana # TODO: Use secret + client_secret: changeme # TODO: Use secret management + scopes: openid profile email + auth_url: https://auth.noxxos.nl/application/o/authorize/ + token_url: https://auth.noxxos.nl/application/o/token/ + api_url: https://auth.noxxos.nl/application/o/userinfo/ + role_attribute_path: contains(groups[*], 'Grafana Admins') && 'Admin' || contains(groups[*], 'Grafana Editors') && 'Editor' || 'Viewer' + allow_sign_up: true + + analytics: + reporting_enabled: false + check_for_updates: false + + log: + mode: console + level: info + + users: + auto_assign_org: true + auto_assign_org_role: Viewer + + # Service Monitor + serviceMonitor: + enabled: true + + # Plugins + plugins: + - grafana-piechart-panel + - grafana-clock-panel + +# Gateway API HTTPRoute +extraObjects: + # ReferenceGrant + - apiVersion: gateway.networking.k8s.io/v1beta1 + kind: ReferenceGrant + metadata: + name: traefik-gateway-access + namespace: monitoring + spec: + from: + - group: gateway.networking.k8s.io + kind: HTTPRoute + namespace: monitoring + to: + - group: "" + kind: Service + + # Grafana HTTPRoute + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: grafana + namespace: monitoring + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: websecure + hostnames: + - "grafana.noxxos.nl" + 
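+      # Route all paths on this hostname to the grafana Service defined in the
+      # rules below; port 80 matches the grafana chart's default Service port
+      # (an assumption based on chart defaults, not set explicitly here).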
rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: grafana + port: 80 diff --git a/apps/monitoring/prometheus/Chart.yaml b/apps/monitoring/prometheus/Chart.yaml new file mode 100644 index 0000000..cd56bf4 --- /dev/null +++ b/apps/monitoring/prometheus/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: prometheus +description: Prometheus monitoring stack with Thanos sidecar wrapper chart +type: application +version: 1.0.0 +appVersion: "0.86.2" + +dependencies: + - name: kube-prometheus-stack + version: 79.4.1 + repository: oci://ghcr.io/prometheus-community/charts diff --git a/apps/monitoring/prometheus/application.yaml b/apps/monitoring/prometheus/application.yaml new file mode 100644 index 0000000..4399fc2 --- /dev/null +++ b/apps/monitoring/prometheus/application.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: prometheus + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "2" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/veda.git + targetRevision: applicationset-rewrite + path: apps/monitoring/prometheus + helm: + releaseName: prometheus + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + - ServerSideApply=true + - SkipDryRunOnMissingResource=true diff --git a/apps/monitoring/prometheus/values.yaml b/apps/monitoring/prometheus/values.yaml new file mode 100644 index 0000000..ae39744 --- /dev/null +++ b/apps/monitoring/prometheus/values.yaml @@ -0,0 +1,138 @@ +kube-prometheus-stack: + # Prometheus Operator + prometheusOperator: + enabled: true + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Prometheus configuration + prometheus: + enabled: true + + prometheusSpec: + # Retention + retention: 24h + retentionSize: 15GB + + # Resources + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + memory: 2Gi + + # Storage + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: ceph-block + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi + + # Thanos sidecar configuration + thanos: + image: quay.io/thanos/thanos:v0.37.2 + version: v0.37.2 + objectStorageConfig: + name: thanos-objstore-secret + key: objstore.yml + + # External labels for Thanos + externalLabels: + cluster: homelab + prometheus: monitoring/prometheus + + # Replicas + replicas: 1 + replicaExternalLabelName: prometheus_replica + + # Service monitors + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + + # Additional scrape configs + additionalScrapeConfigs: [] + + # Alertmanager + alertmanager: + enabled: true + alertmanagerSpec: + replicas: 1 + storage: + volumeClaimTemplate: + spec: + storageClassName: ceph-block + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 5Gi + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Grafana (disabled - using separate Grafana deployment) + grafana: + enabled: false + + # Node Exporter + nodeExporter: + enabled: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + memory: 128Mi + + # Kube State Metrics + kubeStateMetrics: + enabled: true + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Default rules + defaultRules: 
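+    # Rule-group toggles below are cluster-specific; etcd is left disabled on
+    # the assumption that etcd metrics are not scraped in this cluster.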
+ create: true + rules: + alertmanager: true + etcd: false + configReloaders: true + general: true + k8s: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: true + kubelet: true + kubeProxy: true + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeSchedulerAlerting: true + kubeSchedulerRecording: true + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true diff --git a/apps/monitoring/thanos/Chart.yaml b/apps/monitoring/thanos/Chart.yaml new file mode 100644 index 0000000..7fdcbeb --- /dev/null +++ b/apps/monitoring/thanos/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: thanos +description: Thanos distributed metrics wrapper chart +type: application +version: 1.0.0 +appVersion: "0.40.1" + +dependencies: + - name: thanos + version: 1.22.0 + repository: oci://ghcr.io/stevehipwell/helm-charts diff --git a/apps/monitoring/thanos/application.yaml b/apps/monitoring/thanos/application.yaml new file mode 100644 index 0000000..15e24ca --- /dev/null +++ b/apps/monitoring/thanos/application.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: thanos + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "1" + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://git.mvzijl.nl/marco/veda.git + targetRevision: applicationset-rewrite + path: apps/monitoring/thanos + helm: + releaseName: thanos + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + - SkipDryRunOnMissingResource=true diff --git a/apps/monitoring/thanos/templates/extra-objects.yaml b/apps/monitoring/thanos/templates/extra-objects.yaml new file mode 100644 index 0000000..8dd36ec --- /dev/null +++ b/apps/monitoring/thanos/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{- range .Values.extraObjects }} +--- +{{ toYaml . 
}} +{{- end }} diff --git a/apps/monitoring/thanos/values.yaml b/apps/monitoring/thanos/values.yaml new file mode 100644 index 0000000..2c7bf30 --- /dev/null +++ b/apps/monitoring/thanos/values.yaml @@ -0,0 +1,130 @@ +thanos: + # Object storage configuration + objstoreConfig: + create: false # We create the secret via extraObjects + name: thanos-objstore-secret + key: objstore.yml + + # Image configuration + image: + registry: quay.io + repository: thanos/thanos + tag: v0.40.1 + + # Query component + query: + enabled: true + replicaCount: 2 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + stores: + - dnssrv+_grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local + - dnssrv+_grpc._tcp.thanos-receive.monitoring.svc.cluster.local + + # Query Frontend + queryFrontend: + enabled: true + replicaCount: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + + # Store Gateway + storegateway: + enabled: true + replicaCount: 1 + persistence: + enabled: true + storageClass: ceph-block + size: 10Gi + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + memory: 1Gi + + # Compactor + compactor: + enabled: true + persistence: + enabled: true + storageClass: ceph-block + size: 10Gi + retentionResolutionRaw: 14d + retentionResolution5m: 90d + retentionResolution1h: 2y + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + memory: 1Gi + extraFlags: + - --deduplication.replica-label=prometheus_replica + - --deduplication.replica-label=replica + - --downsampling.disable=false + - --compact.enable-vertical-compaction + + # Receive (for remote write from Prometheus) + receive: + enabled: true + replicaCount: 1 + persistence: + enabled: true + storageClass: ceph-block + size: 20Gi + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + memory: 1Gi + + # Metrics and caching + # Note: Memcached configuration would be added here if using external caching + + # Metrics + metrics: + enabled: true + serviceMonitor: + enabled: true + +# S3 Bucket and credentials provisioning +extraObjects: + # ObjectBucketClaim for Thanos metrics + - apiVersion: objectbucket.io/v1alpha1 + kind: ObjectBucketClaim + metadata: + name: thanos-metrics + namespace: monitoring + spec: + bucketName: thanos-metrics + storageClassName: ceph-bucket + additionalConfig: + maxSize: "500Gi" + + # Secret with S3 credentials (will be populated by Rook) + # This is a placeholder - actual credentials come from the OBC + - apiVersion: v1 + kind: Secret + metadata: + name: thanos-objstore-secret + namespace: monitoring + type: Opaque + stringData: + objstore.yml: |- + type: S3 + config: + bucket: thanos-metrics + endpoint: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80 + insecure: true + access_key: ${AWS_ACCESS_KEY_ID} + secret_key: ${AWS_SECRET_ACCESS_KEY} diff --git a/apps/traefik/post-install/dashboard-httproute.yaml b/apps/traefik/post-install/dashboard-httproute.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/traefik/post-install/gateway.yaml b/apps/traefik/post-install/gateway.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/traefik/post-install/reference-grant.yaml b/apps/traefik/post-install/reference-grant.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/traefik/templates/dashboard-httproute.yaml b/apps/traefik/templates/dashboard-httproute.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/traefik/templates/gateway.yaml b/apps/traefik/templates/gateway.yaml new file mode 100644 index 
0000000..e69de29 diff --git a/apps/traefik/templates/reference-grant.yaml b/apps/traefik/templates/reference-grant.yaml new file mode 100644 index 0000000..e69de29 diff --git a/apps/validate-manifests.sh b/apps/validate-manifests.sh new file mode 100755 index 0000000..dcd661c --- /dev/null +++ b/apps/validate-manifests.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# Kubernetes/Helm Configuration Validator +# Validates all applications without deploying them + +# Deliberately no `set -e`: failing checks return nonzero and are tallied +# so every application is validated before the summary is printed. + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Counters +TOTAL=0 +PASSED=0 +FAILED=0 + +echo -e "${BLUE}=== Kubernetes Configuration Validator ===${NC}\n" + +# Function to validate a Helm chart +validate_helm_chart() { + local app_path=$1 + local app_name=$(basename "$app_path") + local namespace=$2 + + TOTAL=$((TOTAL + 1)) + + echo -e "${YELLOW}[$TOTAL] Validating: $app_name (namespace: $namespace)${NC}" + + # Check if Chart.yaml exists + if [ ! -f "$app_path/Chart.yaml" ]; then + echo -e "${RED} ✗ No Chart.yaml found${NC}\n" + FAILED=$((FAILED + 1)) + return 1 + fi + + # Check if dependencies are built (build to temp location if not) + if [ -f "$app_path/Chart.yaml" ] && grep -q "dependencies:" "$app_path/Chart.yaml"; then + if [ ! -d "$app_path/charts" ]; then + echo " → Dependencies not built - building to temporary location..." + + # Create temp directory (cleaned up when this function returns) + local temp_dir=$(mktemp -d) + trap "rm -rf '$temp_dir'" RETURN + + # Copy chart to temp location + cp -r "$app_path" "$temp_dir/" + local temp_chart="$temp_dir/$(basename "$app_path")" + + # Build dependencies in temp location + if ! (cd "$temp_chart" && helm dependency build > /dev/null 2>&1); then + echo -e "${RED} ✗ Failed to build dependencies${NC}\n" + FAILED=$((FAILED + 1)) + return 1 + fi + + # Use temp location for validation + app_path="$temp_chart" + fi + fi + + # Lint the chart + echo " → Running Helm lint..." + if ! (cd "$app_path" && helm lint . 2>&1 | grep -q "0 chart(s) failed"); then + echo -e "${RED} ✗ Helm lint failed${NC}" + (cd "$app_path" && helm lint .) + echo "" + FAILED=$((FAILED + 1)) + return 1 + fi + + # Template the chart + echo " → Rendering Helm templates..." + if ! (cd "$app_path" && helm template "$app_name" . --namespace "$namespace" --validate > /dev/null 2>&1); then + echo -e "${RED} ✗ Helm template failed${NC}" + (cd "$app_path" && helm template "$app_name" . --namespace "$namespace" --validate 2>&1 | head -20) + echo "" + FAILED=$((FAILED + 1)) + return 1 + fi + + # Validate with kubeval (if installed) + if command -v kubeval &> /dev/null; then + echo " → Validating manifests with kubeval..." + if ! (cd "$app_path" && helm template "$app_name" . --namespace "$namespace" | kubeval --ignore-missing-schemas > /dev/null 2>&1); then + echo -e "${YELLOW} ⚠ Kubeval warnings (may be acceptable)${NC}" + fi + fi + + # Check for common issues + echo " → Checking for common issues..." + local rendered=$(cd "$app_path" && helm template "$app_name" . --namespace "$namespace" 2>&1) + + # Check for placeholder secrets + if echo "$rendered" | grep -qi "changeme\|placeholder\|CHANGE_ME\|TODO"; then + echo -e "${YELLOW} ⚠ Warning: Found placeholder values (changeme/placeholder/TODO)${NC}" + fi + + # Check for resource requests/limits + if ! 
echo "$rendered" | grep -q "resources:"; then + echo -e "${YELLOW} ⚠ Warning: No resource requests/limits found${NC}" + fi + + echo -e "${GREEN} ✓ Validation passed${NC}\n" + PASSED=$((PASSED + 1)) + return 0 +} + +# Function to validate an ArgoCD Application manifest +validate_argocd_app() { + local app_file=$1 + local app_name=$(basename "$(dirname "$app_file")") + + TOTAL=$((TOTAL + 1)) + + echo -e "${YELLOW}[$TOTAL] Validating ArgoCD Application: $app_name${NC}" + + # Check YAML syntax + if ! python3 -c "import yaml; yaml.safe_load(open('$app_file'))" 2>/dev/null; then + echo -e "${RED} ✗ Invalid YAML syntax${NC}\n" + FAILED=$((FAILED + 1)) + return 1 + fi + + # Check for required fields + local missing_fields=() + grep -q "kind: Application" "$app_file" || missing_fields+=("kind: Application") + grep -q "metadata:" "$app_file" || missing_fields+=("metadata") + grep -q "spec:" "$app_file" || missing_fields+=("spec") + grep -q "source:" "$app_file" || missing_fields+=("source") + grep -q "destination:" "$app_file" || missing_fields+=("destination") + + if [ ${#missing_fields[@]} -gt 0 ]; then + echo -e "${RED} ✗ Missing required fields: ${missing_fields[*]}${NC}\n" + FAILED=$((FAILED + 1)) + return 1 + fi + + echo -e "${GREEN} ✓ Validation passed${NC}\n" + PASSED=$((PASSED + 1)) + return 0 +} + +# Main validation flow +echo -e "${BLUE}Validating Monitoring Stack...${NC}\n" + +# Thanos +if [ -d "apps/monitoring/thanos" ]; then + validate_helm_chart "apps/monitoring/thanos" "monitoring" + validate_argocd_app "apps/monitoring/thanos/application.yaml" +fi + +# Prometheus +if [ -d "apps/monitoring/prometheus" ]; then + validate_helm_chart "apps/monitoring/prometheus" "monitoring" + validate_argocd_app "apps/monitoring/prometheus/application.yaml" +fi + +# Grafana +if [ -d "apps/monitoring/grafana" ]; then + validate_helm_chart "apps/monitoring/grafana" "monitoring" + validate_argocd_app "apps/monitoring/grafana/application.yaml" +fi + +echo -e "${BLUE}Validating Logging Stack...${NC}\n" + +# Loki +if [ -d "apps/logging/loki" ]; then + validate_helm_chart "apps/logging/loki" "logging" + validate_argocd_app "apps/logging/loki/application.yaml" +fi + +# Promtail +if [ -d "apps/logging/promtail" ]; then + validate_helm_chart "apps/logging/promtail" "logging" + validate_argocd_app "apps/logging/promtail/application.yaml" +fi + +# Additional apps (if they exist) +echo -e "${BLUE}Validating Other Applications...${NC}\n" + +for app_dir in apps/*/; do + app_name=$(basename "$app_dir") + if [ -f "$app_dir/Chart.yaml" ] && [ -f "$app_dir/application.yaml" ]; then + # Skip if already validated + if [[ "$app_name" != "monitoring" ]] && [[ "$app_name" != "logging" ]]; then + # Try to extract namespace from application.yaml + namespace=$(grep -A 10 "destination:" "$app_dir/application.yaml" | grep "namespace:" | head -1 | awk '{print $2}') + [ -z "$namespace" ] && namespace="default" + validate_helm_chart "$app_dir" "$namespace" + validate_argocd_app "$app_dir/application.yaml" + fi + fi +done + +# Summary +echo -e "${BLUE}=== Validation Summary ===${NC}" +echo -e "Total checks: $TOTAL" +echo -e "${GREEN}Passed: $PASSED${NC}" +echo -e "${RED}Failed: $FAILED${NC}\n" + +if [ $FAILED -eq 0 ]; then + echo -e "${GREEN}✓ All validations passed!${NC}" + exit 0 +else + echo -e "${RED}✗ Some validations failed. Please review the errors above.${NC}" + exit 1 +fi