feat: last failed backup status field and metric (#467)

Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com>
Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
This commit is contained in:
Leonardo Cecchi 2025-08-12 06:32:54 +02:00 committed by GitHub
parent 32a5539c18
commit 551a3cde09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 79 additions and 4 deletions

View File

@ -75,6 +75,9 @@ type RecoveryWindow struct {
// The last successful backup time
LastSuccessfulBackupTime *metav1.Time `json:"lastSuccussfulBackupTime,omitempty"`
// The last failed backup time
LastFailedBackupTime *metav1.Time `json:"lastFailedBackupTime,omitempty"`
}
// +kubebuilder:object:root=true

View File

@ -157,6 +157,10 @@ func (in *RecoveryWindow) DeepCopyInto(out *RecoveryWindow) {
in, out := &in.LastSuccessfulBackupTime, &out.LastSuccessfulBackupTime
*out = (*in).DeepCopy()
}
if in.LastFailedBackupTime != nil {
in, out := &in.LastFailedBackupTime, &out.LastFailedBackupTime
*out = (*in).DeepCopy()
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecoveryWindow.

View File

@ -609,6 +609,10 @@ spec:
restored.
format: date-time
type: string
lastFailedBackupTime:
description: The last failed backup time
format: date-time
type: string
lastSuccussfulBackupTime:
description: The last successful backup time
format: date-time

View File

@ -15,6 +15,7 @@ import (
"github.com/cloudnative-pg/machinery/pkg/log"
pgTime "github.com/cloudnative-pg/machinery/pkg/postgres/time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"sigs.k8s.io/controller-runtime/pkg/client"
barmancloudv1 "github.com/cloudnative-pg/plugin-barman-cloud/api/v1"
@ -101,6 +102,13 @@ func (b BackupServiceImplementation) Backup(
postgres.BackupTemporaryDirectory,
); err != nil {
contextLogger.Error(err, "while taking backup")
if failureHandlerError := b.handleBackupError(ctx, configuration); failureHandlerError != nil {
contextLogger.Error(
failureHandlerError,
"Error while handling backup failure, skipping. "+
"BarmanObjectStore object may be not up to date.")
}
return nil, err
}
@ -166,3 +174,18 @@ func (b BackupServiceImplementation) Backup(
Metadata: newBackupResultMetadata(configuration.Cluster.ObjectMeta.UID, executedBackupInfo.TimeLine).toMap(),
}, nil
}
// handleBackupError records the current time as the last failed backup
// time for the server named in cfg, inside the status of the object
// store referenced by cfg.
//
// The update is attempted through retry.RetryOnConflict using
// retry.DefaultBackoff. Note that setLastFailedBackupTime already
// retries on conflict by itself, so the retries are nested; the
// timestamp is sampled again on every outer attempt.
//
// The returned error is whatever the last attempt produced, or nil
// when the status was updated.
func (b BackupServiceImplementation) handleBackupError(ctx context.Context, cfg *config.PluginConfiguration) error {
	recordFailure := func() error {
		objectStoreKey := cfg.GetBarmanObjectKey()
		failedAt := time.Now()

		return setLastFailedBackupTime(ctx, b.Client, objectStoreKey, cfg.ServerName, failedAt)
	}

	return retry.RetryOnConflict(retry.DefaultBackoff, recordFailure)
}

View File

@ -31,6 +31,7 @@ func buildFqName(name string) string {
// Fully-qualified names of the metrics handled in this file. Each one
// is produced by passing the short metric name to buildFqName, and is
// used as the FqName of the corresponding metric.
var (
// Name of the metric reporting the first recoverability point.
firstRecoverabilityPointMetricName = buildFqName("first_recoverability_point")
// Name of the gauge reporting the last available backup as a unix timestamp.
lastAvailableBackupTimestampMetricName = buildFqName("last_available_backup_timestamp")
// Name of the gauge reporting the last failed backup as a unix timestamp.
lastFailedBackupTimestampMetricName = buildFqName("last_failed_backup_timestamp")
)
func (m metricsImpl) GetCapabilities(
@ -72,6 +73,11 @@ func (m metricsImpl) Define(
Help: "The last available backup as a unix timestamp",
ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE},
},
{
FqName: lastFailedBackupTimestampMetricName,
Help: "The last failed backup as a unix timestamp",
ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE},
},
},
}, nil
}
@ -107,18 +113,26 @@ func (m metricsImpl) Collect(
FqName: lastAvailableBackupTimestampMetricName,
Value: 0,
},
{
FqName: lastFailedBackupTimestampMetricName,
Value: 0,
},
},
}, nil
}
var firstRecoverabilityPoint float64
var lastAvailableBackup float64
var lastFailedBackup float64
if x.FirstRecoverabilityPoint != nil {
firstRecoverabilityPoint = float64(x.FirstRecoverabilityPoint.Unix())
}
if x.LastSuccessfulBackupTime != nil {
lastAvailableBackup = float64(x.LastSuccessfulBackupTime.Unix())
}
if x.LastFailedBackupTime != nil {
lastFailedBackup = float64(x.LastFailedBackupTime.Unix())
}
return &metrics.CollectMetricsResult{
Metrics: []*metrics.CollectMetric{
@ -130,6 +144,10 @@ func (m metricsImpl) Collect(
FqName: lastAvailableBackupTimestampMetricName,
Value: lastAvailableBackup,
},
{
FqName: lastFailedBackupTimestampMetricName,
Value: lastFailedBackup,
},
},
}, nil
}

View File

@ -6,6 +6,7 @@ import (
"github.com/cloudnative-pg/barman-cloud/pkg/catalog"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
@ -29,10 +30,9 @@ func updateRecoveryWindow(
return ptr.To(metav1.NewTime(*t))
}
recoveryWindow := barmancloudv1.RecoveryWindow{
FirstRecoverabilityPoint: convertTime(backupList.GetFirstRecoverabilityPoint()),
LastSuccessfulBackupTime: convertTime(backupList.GetLastSuccessfulBackupTime()),
}
recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName]
recoveryWindow.FirstRecoverabilityPoint = convertTime(backupList.GetFirstRecoverabilityPoint())
recoveryWindow.LastSuccessfulBackupTime = convertTime(backupList.GetLastSuccessfulBackupTime())
if objectStore.Status.ServerRecoveryWindow == nil {
objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
@ -41,3 +41,25 @@ func updateRecoveryWindow(
return c.Status().Update(ctx, objectStore)
}
// setLastFailedBackupTime records lastFailedBackupTime as the last
// failed backup time of the server named serverName, in the status of
// the object store identified by objectStoreKey.
//
// The object store is read again on every attempt and the whole
// read-modify-write cycle is retried, using retry.DefaultBackoff, when
// it fails with a conflict. The other fields of the server's recovery
// window are left untouched.
//
// It returns the error of the last attempt, either from fetching the
// object store or from updating its status, or nil on success.
func setLastFailedBackupTime(
	ctx context.Context,
	c client.Client,
	objectStoreKey client.ObjectKey,
	serverName string,
	lastFailedBackupTime time.Time,
) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		var objectStore barmancloudv1.ObjectStore

		if err := c.Get(ctx, objectStoreKey, &objectStore); err != nil {
			return err
		}

		// The map is nil until a recovery window has been stored for the
		// first time (e.g. when the very first backup fails). Reading a
		// nil map is fine, but assigning to it panics, so allocate it
		// before writing the entry back.
		if objectStore.Status.ServerRecoveryWindow == nil {
			objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
		}

		// Map values are not addressable: copy the entry, change it and
		// store it back. A missing entry yields the zero value.
		recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName]
		recoveryWindow.LastFailedBackupTime = ptr.To(metav1.NewTime(lastFailedBackupTime))
		objectStore.Status.ServerRecoveryWindow[serverName] = recoveryWindow

		return c.Status().Update(ctx, &objectStore)
	})
}

View File

@ -101,5 +101,6 @@ _Appears in:_
| --- | --- | --- | --- | --- |
| `firstRecoverabilityPoint` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The first recoverability point in a PostgreSQL server refers to<br />the earliest point in time to which the database can be<br />restored. | True | | |
| `lastSuccussfulBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last successful backup time | True | | |
| `lastFailedBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last failed backup time | True | | |