From 551a3cde09886d88851e751ab289e04630243a7c Mon Sep 17 00:00:00 2001 From: Leonardo Cecchi Date: Tue, 12 Aug 2025 06:32:54 +0200 Subject: [PATCH] feat: last failed backup status field and metric (#467) Signed-off-by: Leonardo Cecchi Signed-off-by: Gabriele Bartolini Co-authored-by: Gabriele Bartolini --- api/v1/objectstore_types.go | 3 ++ api/v1/zz_generated.deepcopy.go | 4 +++ .../barmancloud.cnpg.io_objectstores.yaml | 4 +++ internal/cnpgi/instance/backup.go | 23 ++++++++++++++ internal/cnpgi/instance/metrics.go | 18 +++++++++++ internal/cnpgi/instance/recovery_window.go | 30 ++++++++++++++++--- web/docs/plugin-barman-cloud.v1.md | 1 + 7 files changed, 79 insertions(+), 4 deletions(-) diff --git a/api/v1/objectstore_types.go b/api/v1/objectstore_types.go index 80c4742..1d0fcd2 100644 --- a/api/v1/objectstore_types.go +++ b/api/v1/objectstore_types.go @@ -75,6 +75,9 @@ type RecoveryWindow struct { // The last successful backup time LastSuccessfulBackupTime *metav1.Time `json:"lastSuccussfulBackupTime,omitempty"` + + // The last failed backup time + LastFailedBackupTime *metav1.Time `json:"lastFailedBackupTime,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 11fb2ae..1f92d88 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -157,6 +157,10 @@ func (in *RecoveryWindow) DeepCopyInto(out *RecoveryWindow) { in, out := &in.LastSuccessfulBackupTime, &out.LastSuccessfulBackupTime *out = (*in).DeepCopy() } + if in.LastFailedBackupTime != nil { + in, out := &in.LastFailedBackupTime, &out.LastFailedBackupTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecoveryWindow. 
diff --git a/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml b/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml index 6fb87b5..a83729f 100644 --- a/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml +++ b/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml @@ -609,6 +609,10 @@ spec: restored. format: date-time type: string + lastFailedBackupTime: + description: The last failed backup time + format: date-time + type: string lastSuccussfulBackupTime: description: The last successful backup time format: date-time diff --git a/internal/cnpgi/instance/backup.go b/internal/cnpgi/instance/backup.go index 155a539..33d718f 100644 --- a/internal/cnpgi/instance/backup.go +++ b/internal/cnpgi/instance/backup.go @@ -15,6 +15,7 @@ import ( "github.com/cloudnative-pg/machinery/pkg/log" pgTime "github.com/cloudnative-pg/machinery/pkg/postgres/time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" barmancloudv1 "github.com/cloudnative-pg/plugin-barman-cloud/api/v1" @@ -101,6 +102,13 @@ func (b BackupServiceImplementation) Backup( postgres.BackupTemporaryDirectory, ); err != nil { contextLogger.Error(err, "while taking backup") + + if failureHandlerError := b.handleBackupError(ctx, configuration); failureHandlerError != nil { + contextLogger.Error( + failureHandlerError, + "Error while handling backup failure, skipping. 
"+ + "BarmanObjectStore object may be not up to date.") + } return nil, err } @@ -166,3 +174,18 @@ func (b BackupServiceImplementation) Backup( Metadata: newBackupResultMetadata(configuration.Cluster.ObjectMeta.UID, executedBackupInfo.TimeLine).toMap(), }, nil } + +func (b BackupServiceImplementation) handleBackupError(ctx context.Context, cfg *config.PluginConfiguration) error { + return retry.RetryOnConflict( + retry.DefaultBackoff, + func() error { + return setLastFailedBackupTime( + ctx, + b.Client, + cfg.GetBarmanObjectKey(), + cfg.ServerName, + time.Now(), + ) + }, + ) +} diff --git a/internal/cnpgi/instance/metrics.go b/internal/cnpgi/instance/metrics.go index f1614a1..70315ac 100644 --- a/internal/cnpgi/instance/metrics.go +++ b/internal/cnpgi/instance/metrics.go @@ -31,6 +31,7 @@ func buildFqName(name string) string { var ( firstRecoverabilityPointMetricName = buildFqName("first_recoverability_point") lastAvailableBackupTimestampMetricName = buildFqName("last_available_backup_timestamp") + lastFailedBackupTimestampMetricName = buildFqName("last_failed_backup_timestamp") ) func (m metricsImpl) GetCapabilities( @@ -72,6 +73,11 @@ func (m metricsImpl) Define( Help: "The last available backup as a unix timestamp", ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE}, }, + { + FqName: lastFailedBackupTimestampMetricName, + Help: "The last failed backup as a unix timestamp", + ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE}, + }, }, }, nil } @@ -107,18 +113,26 @@ func (m metricsImpl) Collect( FqName: lastAvailableBackupTimestampMetricName, Value: 0, }, + { + FqName: lastFailedBackupTimestampMetricName, + Value: 0, + }, }, }, nil } var firstRecoverabilityPoint float64 var lastAvailableBackup float64 + var lastFailedBackup float64 if x.FirstRecoverabilityPoint != nil { firstRecoverabilityPoint = float64(x.FirstRecoverabilityPoint.Unix()) } if x.LastSuccessfulBackupTime != nil { lastAvailableBackup = 
float64(x.LastSuccessfulBackupTime.Unix()) } + if x.LastFailedBackupTime != nil { + lastFailedBackup = float64(x.LastFailedBackupTime.Unix()) + } return &metrics.CollectMetricsResult{ Metrics: []*metrics.CollectMetric{ @@ -130,6 +144,10 @@ func (m metricsImpl) Collect( FqName: lastAvailableBackupTimestampMetricName, Value: lastAvailableBackup, }, + { + FqName: lastFailedBackupTimestampMetricName, + Value: lastFailedBackup, + }, }, }, nil } diff --git a/internal/cnpgi/instance/recovery_window.go b/internal/cnpgi/instance/recovery_window.go index 8e3aea9..5dcc962 100644 --- a/internal/cnpgi/instance/recovery_window.go +++ b/internal/cnpgi/instance/recovery_window.go @@ -6,6 +6,7 @@ import ( "github.com/cloudnative-pg/barman-cloud/pkg/catalog" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -29,10 +30,9 @@ func updateRecoveryWindow( return ptr.To(metav1.NewTime(*t)) } - recoveryWindow := barmancloudv1.RecoveryWindow{ - FirstRecoverabilityPoint: convertTime(backupList.GetFirstRecoverabilityPoint()), - LastSuccessfulBackupTime: convertTime(backupList.GetLastSuccessfulBackupTime()), - } + recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName] + recoveryWindow.FirstRecoverabilityPoint = convertTime(backupList.GetFirstRecoverabilityPoint()) + recoveryWindow.LastSuccessfulBackupTime = convertTime(backupList.GetLastSuccessfulBackupTime()) if objectStore.Status.ServerRecoveryWindow == nil { objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
@@ -41,3 +41,40 @@ func updateRecoveryWindow(
 
 	return c.Status().Update(ctx, objectStore)
 }
+
+// setLastFailedBackupTime records lastFailedBackupTime as the last failed
+// backup time of serverName in the status of the ObjectStore identified by
+// objectStoreKey.
+//
+// The ObjectStore is read again on every attempt, and the status update is
+// retried with retry.DefaultBackoff through retry.RetryOnConflict. A
+// non-conflict error, or a conflict outlasting the backoff, is returned.
+func setLastFailedBackupTime(
+	ctx context.Context,
+	c client.Client,
+	objectStoreKey client.ObjectKey,
+	serverName string,
+	lastFailedBackupTime time.Time,
+) error {
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		var objectStore barmancloudv1.ObjectStore
+
+		if err := c.Get(ctx, objectStoreKey, &objectStore); err != nil {
+			return err
+		}
+
+		// Indexing a nil map or a missing key yields the zero value, so this
+		// read is safe and keeps the other fields of an existing entry.
+		recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName]
+		recoveryWindow.LastFailedBackupTime = ptr.To(metav1.NewTime(lastFailedBackupTime))
+
+		// Assigning into a nil map panics: allocate it first, exactly as
+		// updateRecoveryWindow does before storing its entry.
+		if objectStore.Status.ServerRecoveryWindow == nil {
+			objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
+		}
+		objectStore.Status.ServerRecoveryWindow[serverName] = recoveryWindow
+
+		return c.Status().Update(ctx, &objectStore)
+	})
+}
diff --git a/web/docs/plugin-barman-cloud.v1.md b/web/docs/plugin-barman-cloud.v1.md index 552dc27..da4b958 100644 --- a/web/docs/plugin-barman-cloud.v1.md +++ b/web/docs/plugin-barman-cloud.v1.md @@ -101,5 +101,6 @@ _Appears in:_ | --- | --- | --- | --- | --- | | `firstRecoverabilityPoint` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The first recoverability point in a PostgreSQL server refers to
the earliest point in time to which the database can be
restored. | True | | | | `lastSuccussfulBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last successful backup time | True | | | +| `lastFailedBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last failed backup time | True | | |