feat: last failed backup status field and metric (#467)

Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com>
Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
This commit is contained in:
Leonardo Cecchi 2025-08-12 06:32:54 +02:00 committed by GitHub
parent 32a5539c18
commit 551a3cde09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 79 additions and 4 deletions

View File

@ -75,6 +75,9 @@ type RecoveryWindow struct {
// The last successful backup time // The last successful backup time
LastSuccessfulBackupTime *metav1.Time `json:"lastSuccussfulBackupTime,omitempty"` LastSuccessfulBackupTime *metav1.Time `json:"lastSuccussfulBackupTime,omitempty"`
// The last failed backup time
LastFailedBackupTime *metav1.Time `json:"lastFailedBackupTime,omitempty"`
} }
// +kubebuilder:object:root=true // +kubebuilder:object:root=true

View File

@ -157,6 +157,10 @@ func (in *RecoveryWindow) DeepCopyInto(out *RecoveryWindow) {
in, out := &in.LastSuccessfulBackupTime, &out.LastSuccessfulBackupTime in, out := &in.LastSuccessfulBackupTime, &out.LastSuccessfulBackupTime
*out = (*in).DeepCopy() *out = (*in).DeepCopy()
} }
if in.LastFailedBackupTime != nil {
in, out := &in.LastFailedBackupTime, &out.LastFailedBackupTime
*out = (*in).DeepCopy()
}
} }
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecoveryWindow. // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecoveryWindow.

View File

@ -609,6 +609,10 @@ spec:
restored. restored.
format: date-time format: date-time
type: string type: string
lastFailedBackupTime:
description: The last failed backup time
format: date-time
type: string
lastSuccussfulBackupTime: lastSuccussfulBackupTime:
description: The last successful backup time description: The last successful backup time
format: date-time format: date-time

View File

@ -15,6 +15,7 @@ import (
"github.com/cloudnative-pg/machinery/pkg/log" "github.com/cloudnative-pg/machinery/pkg/log"
pgTime "github.com/cloudnative-pg/machinery/pkg/postgres/time" pgTime "github.com/cloudnative-pg/machinery/pkg/postgres/time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
barmancloudv1 "github.com/cloudnative-pg/plugin-barman-cloud/api/v1" barmancloudv1 "github.com/cloudnative-pg/plugin-barman-cloud/api/v1"
@ -101,6 +102,13 @@ func (b BackupServiceImplementation) Backup(
postgres.BackupTemporaryDirectory, postgres.BackupTemporaryDirectory,
); err != nil { ); err != nil {
contextLogger.Error(err, "while taking backup") contextLogger.Error(err, "while taking backup")
if failureHandlerError := b.handleBackupError(ctx, configuration); failureHandlerError != nil {
contextLogger.Error(
failureHandlerError,
"Error while handling backup failure, skipping. "+
"BarmanObjectStore object may be not up to date.")
}
return nil, err return nil, err
} }
@ -166,3 +174,18 @@ func (b BackupServiceImplementation) Backup(
Metadata: newBackupResultMetadata(configuration.Cluster.ObjectMeta.UID, executedBackupInfo.TimeLine).toMap(), Metadata: newBackupResultMetadata(configuration.Cluster.ObjectMeta.UID, executedBackupInfo.TimeLine).toMap(),
}, nil }, nil
} }
// handleBackupError records the current time as the last failed backup time
// of the configured server in the referenced Barman object store.
//
// The write is wrapped in retry.RetryOnConflict with retry.DefaultBackoff.
// Note that setLastFailedBackupTime performs its own conflict retry as well,
// so this wrapper is an additional outer layer around it.
//
// It returns the error of the last attempt, or nil when the status has been
// updated.
func (b BackupServiceImplementation) handleBackupError(ctx context.Context, cfg *config.PluginConfiguration) error {
	// The timestamp is taken inside the closure on purpose: every attempt
	// records the moment it actually ran.
	recordFailure := func() error {
		return setLastFailedBackupTime(
			ctx,
			b.Client,
			cfg.GetBarmanObjectKey(),
			cfg.ServerName,
			time.Now(),
		)
	}

	return retry.RetryOnConflict(retry.DefaultBackoff, recordFailure)
}

View File

@ -31,6 +31,7 @@ func buildFqName(name string) string {
var ( var (
firstRecoverabilityPointMetricName = buildFqName("first_recoverability_point") firstRecoverabilityPointMetricName = buildFqName("first_recoverability_point")
lastAvailableBackupTimestampMetricName = buildFqName("last_available_backup_timestamp") lastAvailableBackupTimestampMetricName = buildFqName("last_available_backup_timestamp")
lastFailedBackupTimestampMetricName = buildFqName("last_failed_backup_timestamp")
) )
func (m metricsImpl) GetCapabilities( func (m metricsImpl) GetCapabilities(
@ -72,6 +73,11 @@ func (m metricsImpl) Define(
Help: "The last available backup as a unix timestamp", Help: "The last available backup as a unix timestamp",
ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE}, ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE},
}, },
{
FqName: lastFailedBackupTimestampMetricName,
Help: "The last failed backup as a unix timestamp",
ValueType: &metrics.MetricType{Type: metrics.MetricType_TYPE_GAUGE},
},
}, },
}, nil }, nil
} }
@ -107,18 +113,26 @@ func (m metricsImpl) Collect(
FqName: lastAvailableBackupTimestampMetricName, FqName: lastAvailableBackupTimestampMetricName,
Value: 0, Value: 0,
}, },
{
FqName: lastFailedBackupTimestampMetricName,
Value: 0,
},
}, },
}, nil }, nil
} }
var firstRecoverabilityPoint float64 var firstRecoverabilityPoint float64
var lastAvailableBackup float64 var lastAvailableBackup float64
var lastFailedBackup float64
if x.FirstRecoverabilityPoint != nil { if x.FirstRecoverabilityPoint != nil {
firstRecoverabilityPoint = float64(x.FirstRecoverabilityPoint.Unix()) firstRecoverabilityPoint = float64(x.FirstRecoverabilityPoint.Unix())
} }
if x.LastSuccessfulBackupTime != nil { if x.LastSuccessfulBackupTime != nil {
lastAvailableBackup = float64(x.LastSuccessfulBackupTime.Unix()) lastAvailableBackup = float64(x.LastSuccessfulBackupTime.Unix())
} }
if x.LastFailedBackupTime != nil {
lastFailedBackup = float64(x.LastFailedBackupTime.Unix())
}
return &metrics.CollectMetricsResult{ return &metrics.CollectMetricsResult{
Metrics: []*metrics.CollectMetric{ Metrics: []*metrics.CollectMetric{
@ -130,6 +144,10 @@ func (m metricsImpl) Collect(
FqName: lastAvailableBackupTimestampMetricName, FqName: lastAvailableBackupTimestampMetricName,
Value: lastAvailableBackup, Value: lastAvailableBackup,
}, },
{
FqName: lastFailedBackupTimestampMetricName,
Value: lastFailedBackup,
},
}, },
}, nil }, nil
} }

View File

@ -6,6 +6,7 @@ import (
"github.com/cloudnative-pg/barman-cloud/pkg/catalog" "github.com/cloudnative-pg/barman-cloud/pkg/catalog"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"k8s.io/utils/ptr" "k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
@ -29,10 +30,9 @@ func updateRecoveryWindow(
return ptr.To(metav1.NewTime(*t)) return ptr.To(metav1.NewTime(*t))
} }
recoveryWindow := barmancloudv1.RecoveryWindow{ recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName]
FirstRecoverabilityPoint: convertTime(backupList.GetFirstRecoverabilityPoint()), recoveryWindow.FirstRecoverabilityPoint = convertTime(backupList.GetFirstRecoverabilityPoint())
LastSuccessfulBackupTime: convertTime(backupList.GetLastSuccessfulBackupTime()), recoveryWindow.LastSuccessfulBackupTime = convertTime(backupList.GetLastSuccessfulBackupTime())
}
if objectStore.Status.ServerRecoveryWindow == nil { if objectStore.Status.ServerRecoveryWindow == nil {
objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow) objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
@ -41,3 +41,25 @@ func updateRecoveryWindow(
return c.Status().Update(ctx, objectStore) return c.Status().Update(ctx, objectStore)
} }
// setLastFailedBackupTime sets the last failed backup time in the
// passed object store, for the passed server name.
//
// The object store is read again on every attempt and the status update is
// retried on conflict using retry.DefaultBackoff. Any other field already
// stored in the recovery window of the server is preserved.
//
// It returns the error raised while getting the object store or while
// updating its status, or nil on success.
func setLastFailedBackupTime(
	ctx context.Context,
	c client.Client,
	objectStoreKey client.ObjectKey,
	serverName string,
	lastFailedBackupTime time.Time,
) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		var objectStore barmancloudv1.ObjectStore

		if err := c.Get(ctx, objectStoreKey, &objectStore); err != nil {
			return err
		}

		// The status map may still be nil (for example when the very first
		// backup fails before any recovery window has been stored).
		// Assigning into a nil map panics, so allocate it first.
		if objectStore.Status.ServerRecoveryWindow == nil {
			objectStore.Status.ServerRecoveryWindow = make(map[string]barmancloudv1.RecoveryWindow)
		}

		// Map values are not addressable: copy the entry, change it and
		// store it back.
		recoveryWindow := objectStore.Status.ServerRecoveryWindow[serverName]
		recoveryWindow.LastFailedBackupTime = ptr.To(metav1.NewTime(lastFailedBackupTime))
		objectStore.Status.ServerRecoveryWindow[serverName] = recoveryWindow

		return c.Status().Update(ctx, &objectStore)
	})
}

View File

@ -101,5 +101,6 @@ _Appears in:_
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
| `firstRecoverabilityPoint` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The first recoverability point in a PostgreSQL server refers to<br />the earliest point in time to which the database can be<br />restored. | True | | | | `firstRecoverabilityPoint` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The first recoverability point in a PostgreSQL server refers to<br />the earliest point in time to which the database can be<br />restored. | True | | |
| `lastSuccussfulBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last successful backup time | True | | | | `lastSuccussfulBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last successful backup time | True | | |
| `lastFailedBackupTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | The last failed backup time | True | | |