From ce7d8e9fbe787ddf83d5377e155cde3629914d35 Mon Sep 17 00:00:00 2001 From: Tudor Golubenco Date: Sun, 14 Sep 2025 10:22:33 -0700 Subject: [PATCH] Also make the liveness and readiness probes configurable Signed-off-by: Tudor Golubenco --- api/v1/objectstore_types.go | 8 ++ api/v1/zz_generated.deepcopy.go | 10 ++ .../barmancloud.cnpg.io_objectstores.yaml | 74 ++++++++++- hack/examples/minio-store.yaml | 12 ++ internal/cnpgi/operator/lifecycle.go | 120 ++++++++++++++---- internal/cnpgi/operator/lifecycle_probes.go | 78 +++++++++++- internal/cnpgi/operator/lifecycle_test.go | 66 ++++++++++ manifest.yaml | 108 ++++++++++++++++ 8 files changed, 442 insertions(+), 34 deletions(-) diff --git a/api/v1/objectstore_types.go b/api/v1/objectstore_types.go index 171eb61..e89e7c1 100644 --- a/api/v1/objectstore_types.go +++ b/api/v1/objectstore_types.go @@ -70,6 +70,14 @@ type InstanceSidecarConfiguration struct { // StartupProbe defines the configuration for the startup probe of the sidecar container. // +optional StartupProbe *ProbeConfig `json:"startupProbe,omitempty"` + + // LivenessProbe defines the configuration for the liveness probe of the sidecar container. + // +optional + LivenessProbe *ProbeConfig `json:"livenessProbe,omitempty"` + + // ReadinessProbe defines the configuration for the readiness probe of the sidecar container. + // +optional + ReadinessProbe *ProbeConfig `json:"readinessProbe,omitempty"` } // ObjectStoreSpec defines the desired state of ObjectStore. 
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 031dcbb..56ace98 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -41,6 +41,16 @@ func (in *InstanceSidecarConfiguration) DeepCopyInto(out *InstanceSidecarConfigu *out = new(ProbeConfig) **out = **in } + if in.LivenessProbe != nil { + in, out := &in.LivenessProbe, &out.LivenessProbe + *out = new(ProbeConfig) + **out = **in + } + if in.ReadinessProbe != nil { + in, out := &in.ReadinessProbe, &out.ReadinessProbe + *out = new(ProbeConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstanceSidecarConfiguration. diff --git a/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml b/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml index c4a9c1f..ad76f2c 100644 --- a/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml +++ b/config/crd/bases/barmancloud.cnpg.io_objectstores.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.18.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: objectstores.barmancloud.cnpg.io spec: group: barmancloud.cnpg.io @@ -511,6 +511,78 @@ spec: - name type: object type: array + livenessProbe: + description: LivenessProbe defines the configuration for the liveness + probe of the sidecar container. + properties: + failureThreshold: + default: 10 + description: FailureThreshold is the minimum consecutive failures + for the probe to be considered failed. + format: int32 + type: integer + initialDelaySeconds: + default: 0 + description: InitialDelaySeconds is the number of seconds + after the container has started before startup probes are + initiated. + format: int32 + type: integer + periodSeconds: + default: 10 + description: PeriodSeconds is how often (in seconds) to perform + the probe. 
+ format: int32 + type: integer + successThreshold: + default: 1 + description: SuccessThreshold is the minimum consecutive successes + for the probe to be considered successful. + format: int32 + type: integer + timeoutSeconds: + default: 10 + description: TimeoutSeconds is the number of seconds after + which the probe times out. + format: int32 + type: integer + type: object + readinessProbe: + description: ReadinessProbe defines the configuration for the + readiness probe of the sidecar container. + properties: + failureThreshold: + default: 10 + description: FailureThreshold is the minimum consecutive failures + for the probe to be considered failed. + format: int32 + type: integer + initialDelaySeconds: + default: 0 + description: InitialDelaySeconds is the number of seconds + after the container has started before startup probes are + initiated. + format: int32 + type: integer + periodSeconds: + default: 10 + description: PeriodSeconds is how often (in seconds) to perform + the probe. + format: int32 + type: integer + successThreshold: + default: 1 + description: SuccessThreshold is the minimum consecutive successes + for the probe to be considered successful. + format: int32 + type: integer + timeoutSeconds: + default: 10 + description: TimeoutSeconds is the number of seconds after + which the probe times out. + format: int32 + type: integer + type: object resources: description: Resources define cpu/memory requests and limits for the sidecar that runs in the instance pods. 
diff --git a/hack/examples/minio-store.yaml b/hack/examples/minio-store.yaml index b36fb24..40943c1 100644 --- a/hack/examples/minio-store.yaml +++ b/hack/examples/minio-store.yaml @@ -19,6 +19,18 @@ spec: periodSeconds: 1 failureThreshold: 10 successThreshold: 1 + livenessProbe: + initialDelaySeconds: 30 + timeoutSeconds: 5 + periodSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + readinessProbe: + initialDelaySeconds: 5 + timeoutSeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + successThreshold: 1 configuration: endpointCA: name: minio-server-tls diff --git a/internal/cnpgi/operator/lifecycle.go b/internal/cnpgi/operator/lifecycle.go index 3e5299b..cd4b0dd 100644 --- a/internal/cnpgi/operator/lifecycle.go +++ b/internal/cnpgi/operator/lifecycle.go @@ -131,19 +131,33 @@ func (impl LifecycleImplementation) reconcileJob( return nil, err } + livenessProbe, err := impl.collectSidecarLivenessProbeForRecoveryJob(ctx, pluginConfiguration) + if err != nil { + return nil, err + } + + readinessProbe, err := impl.collectSidecarReadinessProbeForRecoveryJob(ctx, pluginConfiguration) + if err != nil { + return nil, err + } + return reconcileJob(ctx, cluster, request, sidecarConfiguration{ - env: env, - certificates: certificates, - resources: resources, - startupProbe: startupProbe, + env: env, + certificates: certificates, + resources: resources, + startupProbe: startupProbe, + livenessProbe: livenessProbe, + readinessProbe: readinessProbe, }) } type sidecarConfiguration struct { - env []corev1.EnvVar - certificates []corev1.VolumeProjection - resources corev1.ResourceRequirements - startupProbe *barmancloudv1.ProbeConfig + env []corev1.EnvVar + certificates []corev1.VolumeProjection + resources corev1.ResourceRequirements + startupProbe *barmancloudv1.ProbeConfig + livenessProbe *barmancloudv1.ProbeConfig + readinessProbe *barmancloudv1.ProbeConfig } func reconcileJob( @@ -230,11 +244,23 @@ func (impl LifecycleImplementation) reconcilePod( return nil, err } + 
livenessProbe, err := impl.collectSidecarLivenessProbeForInstancePod(ctx, pluginConfiguration) + if err != nil { + return nil, err + } + + readinessProbe, err := impl.collectSidecarReadinessProbeForInstancePod(ctx, pluginConfiguration) + if err != nil { + return nil, err + } + return reconcilePod(ctx, cluster, request, pluginConfiguration, sidecarConfiguration{ - env: env, - certificates: certificates, - resources: resources, - startupProbe: startupProbe, + env: env, + certificates: certificates, + resources: resources, + startupProbe: startupProbe, + livenessProbe: livenessProbe, + readinessProbe: readinessProbe, }) } @@ -325,25 +351,37 @@ func reconcilePodSpec( }, } - // Apply configurable probe settings if available - if config.startupProbe != nil { - // Copy timing and threshold settings from user configuration - baseProbe.InitialDelaySeconds = config.startupProbe.InitialDelaySeconds - baseProbe.TimeoutSeconds = config.startupProbe.TimeoutSeconds - baseProbe.PeriodSeconds = config.startupProbe.PeriodSeconds - baseProbe.FailureThreshold = config.startupProbe.FailureThreshold - baseProbe.SuccessThreshold = config.startupProbe.SuccessThreshold - } else { - // Fallback to default values - baseProbe.FailureThreshold = 10 - baseProbe.TimeoutSeconds = 10 - } + startupProbe := createProbe(baseProbe, config.startupProbe, &barmancloudv1.ProbeConfig{ + FailureThreshold: 10, + TimeoutSeconds: 10, + InitialDelaySeconds: 0, + SuccessThreshold: 1, + PeriodSeconds: 10, + }) + + livenessProbe := createProbe(baseProbe, config.livenessProbe, &barmancloudv1.ProbeConfig{ + FailureThreshold: 3, + TimeoutSeconds: 10, + InitialDelaySeconds: 0, + SuccessThreshold: 1, + PeriodSeconds: 10, + }) + + readinessProbe := createProbe(baseProbe, config.readinessProbe, &barmancloudv1.ProbeConfig{ + FailureThreshold: 3, + TimeoutSeconds: 10, + InitialDelaySeconds: 0, + SuccessThreshold: 1, + PeriodSeconds: 10, + }) // fixed values sidecarTemplate.Name = "plugin-barman-cloud" sidecarTemplate.Image 
= viper.GetString("sidecar-image") sidecarTemplate.ImagePullPolicy = cluster.Spec.ImagePullPolicy - sidecarTemplate.StartupProbe = baseProbe.DeepCopy() + sidecarTemplate.StartupProbe = startupProbe + sidecarTemplate.LivenessProbe = livenessProbe + sidecarTemplate.ReadinessProbe = readinessProbe sidecarTemplate.SecurityContext = &corev1.SecurityContext{ AllowPrivilegeEscalation: ptr.To(false), RunAsNonRoot: ptr.To(true), @@ -567,3 +605,33 @@ func getCNPGJobRole(job *batchv1.Job) string { return "" } + +// createProbe creates a probe using the base probe's handler and applies configuration or default values +func createProbe(baseProbe *corev1.Probe, config *barmancloudv1.ProbeConfig, defaults *barmancloudv1.ProbeConfig) *corev1.Probe { + probe := baseProbe.DeepCopy() + probe.FailureThreshold = defaults.FailureThreshold + probe.TimeoutSeconds = defaults.TimeoutSeconds + probe.InitialDelaySeconds = defaults.InitialDelaySeconds + probe.SuccessThreshold = defaults.SuccessThreshold + probe.PeriodSeconds = defaults.PeriodSeconds + + if config != nil { + if config.InitialDelaySeconds != 0 { + probe.InitialDelaySeconds = config.InitialDelaySeconds + } + if config.TimeoutSeconds != 0 { + probe.TimeoutSeconds = config.TimeoutSeconds + } + if config.PeriodSeconds != 0 { + probe.PeriodSeconds = config.PeriodSeconds + } + if config.SuccessThreshold != 0 { + probe.SuccessThreshold = config.SuccessThreshold + } + if config.FailureThreshold != 0 { + probe.FailureThreshold = config.FailureThreshold + } + } + + return probe +} diff --git a/internal/cnpgi/operator/lifecycle_probes.go b/internal/cnpgi/operator/lifecycle_probes.go index 4bee0df..fc4c345 100644 --- a/internal/cnpgi/operator/lifecycle_probes.go +++ b/internal/cnpgi/operator/lifecycle_probes.go @@ -7,9 +7,14 @@ import ( "github.com/cloudnative-pg/plugin-barman-cloud/internal/cnpgi/operator/config" ) -func (impl LifecycleImplementation) collectSidecarStartupProbeForRecoveryJob( +// probeAccessor is a function type that 
extracts a specific probe configuration from an ObjectStore +type probeAccessor func(*barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig + +// collectSidecarProbeForRecoveryJob is a generic function to collect probe configurations for recovery jobs +func (impl LifecycleImplementation) collectSidecarProbeForRecoveryJob( ctx context.Context, configuration *config.PluginConfiguration, + accessor probeAccessor, ) (*barmancloudv1.ProbeConfig, error) { if len(configuration.RecoveryBarmanObjectName) > 0 { var barmanObjectStore barmancloudv1.ObjectStore @@ -17,43 +22,102 @@ func (impl LifecycleImplementation) collectSidecarStartupProbeForRecoveryJob( return nil, err } - return barmanObjectStore.Spec.InstanceSidecarConfiguration.StartupProbe, nil + return accessor(&barmanObjectStore), nil } return nil, nil } -func (impl LifecycleImplementation) collectSidecarStartupProbeForInstancePod( +// collectSidecarProbeForInstancePod is a generic function to collect probe configurations for instance pods +func (impl LifecycleImplementation) collectSidecarProbeForInstancePod( ctx context.Context, configuration *config.PluginConfiguration, + accessor probeAccessor, + probeType string, ) (*barmancloudv1.ProbeConfig, error) { if len(configuration.BarmanObjectName) > 0 { // On a replica cluster that also archives, the designated primary // will use both the replica source object store and the object store // of the cluster. // In this case, we use the cluster object store for configuring - // the startup probe of the sidecar container. + // the probe of the sidecar container. 
var barmanObjectStore barmancloudv1.ObjectStore if err := impl.Client.Get(ctx, configuration.GetBarmanObjectKey(), &barmanObjectStore); err != nil { return nil, err } - return barmanObjectStore.Spec.InstanceSidecarConfiguration.StartupProbe, nil + return accessor(&barmanObjectStore), nil } if len(configuration.RecoveryBarmanObjectName) > 0 { // On a replica cluster that doesn't archive, the designated primary // uses only the replica source object store. // In this case, we use the replica source object store for configuring - // the startup probe of the sidecar container. + // the probe of the sidecar container. var barmanObjectStore barmancloudv1.ObjectStore if err := impl.Client.Get(ctx, configuration.GetRecoveryBarmanObjectKey(), &barmanObjectStore); err != nil { return nil, err } - return barmanObjectStore.Spec.InstanceSidecarConfiguration.StartupProbe, nil + return accessor(&barmanObjectStore), nil } return nil, nil } + +// Specific probe collection methods that use the generic functions + +func (impl LifecycleImplementation) collectSidecarStartupProbeForRecoveryJob( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, error) { + return impl.collectSidecarProbeForRecoveryJob(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.StartupProbe + }) +} + +func (impl LifecycleImplementation) collectSidecarStartupProbeForInstancePod( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, error) { + return impl.collectSidecarProbeForInstancePod(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.StartupProbe + }, "startup") +} + +func (impl LifecycleImplementation) collectSidecarLivenessProbeForRecoveryJob( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, 
error) { + return impl.collectSidecarProbeForRecoveryJob(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.LivenessProbe + }) +} + +func (impl LifecycleImplementation) collectSidecarLivenessProbeForInstancePod( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, error) { + return impl.collectSidecarProbeForInstancePod(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.LivenessProbe + }, "liveness") +} + +func (impl LifecycleImplementation) collectSidecarReadinessProbeForRecoveryJob( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, error) { + return impl.collectSidecarProbeForRecoveryJob(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.ReadinessProbe + }) +} + +func (impl LifecycleImplementation) collectSidecarReadinessProbeForInstancePod( + ctx context.Context, + configuration *config.PluginConfiguration, +) (*barmancloudv1.ProbeConfig, error) { + return impl.collectSidecarProbeForInstancePod(ctx, configuration, func(store *barmancloudv1.ObjectStore) *barmancloudv1.ProbeConfig { + return store.Spec.InstanceSidecarConfiguration.ReadinessProbe + }, "readiness") +} diff --git a/internal/cnpgi/operator/lifecycle_test.go b/internal/cnpgi/operator/lifecycle_test.go index 3653201..f7b6dcd 100644 --- a/internal/cnpgi/operator/lifecycle_test.go +++ b/internal/cnpgi/operator/lifecycle_test.go @@ -220,6 +220,72 @@ var _ = Describe("LifecycleImplementation", func() { Expect(string(response.JsonPatch)).To(ContainSubstring("\"successThreshold\":1")) }) + It("decouples probe configurations - startupProbe doesn't affect other probes", func(ctx SpecContext) { + // Configure only startupProbe with custom settings + 
startupProbeConfig := &barmancloudv1.ProbeConfig{ + InitialDelaySeconds: 5, + TimeoutSeconds: 20, + PeriodSeconds: 3, + FailureThreshold: 8, + SuccessThreshold: 2, + } + + pod := &corev1.Pod{ + TypeMeta: podTypeMeta, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "postgres", + }, + }, + }, + } + + podJSON, err := json.Marshal(pod) + Expect(err).NotTo(HaveOccurred()) + + request := &lifecycle.OperatorLifecycleRequest{ + ObjectDefinition: podJSON, + } + + response, err := reconcilePod(ctx, cluster, request, pluginConfiguration, sidecarConfiguration{ + startupProbe: startupProbeConfig, + // livenessProbe and readinessProbe are nil - should use defaults + }) + Expect(err).NotTo(HaveOccurred()) + Expect(response).NotTo(BeNil()) + Expect(response.JsonPatch).NotTo(BeEmpty()) + + patchStr := string(response.JsonPatch) + + // Verify startupProbe has custom settings + Expect(patchStr).To(ContainSubstring("startupProbe")) + Expect(patchStr).To(ContainSubstring("\"initialDelaySeconds\":5")) + Expect(patchStr).To(ContainSubstring("\"timeoutSeconds\":20")) + Expect(patchStr).To(ContainSubstring("\"periodSeconds\":3")) + Expect(patchStr).To(ContainSubstring("\"failureThreshold\":8")) + Expect(patchStr).To(ContainSubstring("\"successThreshold\":2")) + + // Verify livenessProbe has default settings (not affected by startupProbe) + Expect(patchStr).To(ContainSubstring("livenessProbe")) + Expect(patchStr).To(ContainSubstring("\"failureThreshold\":3")) // default for liveness + Expect(patchStr).To(ContainSubstring("\"timeoutSeconds\":10")) // default for liveness + // initialDelaySeconds: 0 is omitted from JSON when it's the zero value + + // Verify readinessProbe has default settings (not affected by startupProbe) + Expect(patchStr).To(ContainSubstring("readinessProbe")) + Expect(patchStr).To(ContainSubstring("\"failureThreshold\":3")) // default for readiness + 
Expect(patchStr).To(ContainSubstring("\"timeoutSeconds\":10")) // default for readiness + // initialDelaySeconds: 0 is omitted from JSON when it's the zero value + + // Verify that livenessProbe and readinessProbe don't have startupProbe values + Expect(patchStr).NotTo(MatchRegexp(`"livenessProbe"[^}]*"initialDelaySeconds":5`)) + Expect(patchStr).NotTo(MatchRegexp(`"readinessProbe"[^}]*"initialDelaySeconds":5`)) + }) + It("returns a patch for a valid pod", func(ctx SpecContext) { pod := &corev1.Pod{ TypeMeta: podTypeMeta, diff --git a/manifest.yaml b/manifest.yaml index 7248ce9..84ed5bc 100644 --- a/manifest.yaml +++ b/manifest.yaml @@ -510,6 +510,78 @@ spec: - name type: object type: array + livenessProbe: + description: LivenessProbe defines the configuration for the liveness + probe of the sidecar container. + properties: + failureThreshold: + default: 10 + description: FailureThreshold is the minimum consecutive failures + for the probe to be considered failed. + format: int32 + type: integer + initialDelaySeconds: + default: 0 + description: InitialDelaySeconds is the number of seconds + after the container has started before startup probes are + initiated. + format: int32 + type: integer + periodSeconds: + default: 10 + description: PeriodSeconds is how often (in seconds) to perform + the probe. + format: int32 + type: integer + successThreshold: + default: 1 + description: SuccessThreshold is the minimum consecutive successes + for the probe to be considered successful. + format: int32 + type: integer + timeoutSeconds: + default: 10 + description: TimeoutSeconds is the number of seconds after + which the probe times out. + format: int32 + type: integer + type: object + readinessProbe: + description: ReadinessProbe defines the configuration for the + readiness probe of the sidecar container. + properties: + failureThreshold: + default: 10 + description: FailureThreshold is the minimum consecutive failures + for the probe to be considered failed. 
+ format: int32 + type: integer + initialDelaySeconds: + default: 0 + description: InitialDelaySeconds is the number of seconds + after the container has started before startup probes are + initiated. + format: int32 + type: integer + periodSeconds: + default: 10 + description: PeriodSeconds is how often (in seconds) to perform + the probe. + format: int32 + type: integer + successThreshold: + default: 1 + description: SuccessThreshold is the minimum consecutive successes + for the probe to be considered successful. + format: int32 + type: integer + timeoutSeconds: + default: 10 + description: TimeoutSeconds is the number of seconds after + which the probe times out. + format: int32 + type: integer + type: object resources: description: Resources define cpu/memory requests and limits for the sidecar that runs in the instance pods. @@ -576,6 +648,42 @@ spec: The retentionCheckInterval defines the frequency at which the system checks and enforces retention policies. type: integer + startupProbe: + description: StartupProbe defines the configuration for the startup + probe of the sidecar container. + properties: + failureThreshold: + default: 10 + description: FailureThreshold is the minimum consecutive failures + for the probe to be considered failed. + format: int32 + type: integer + initialDelaySeconds: + default: 0 + description: InitialDelaySeconds is the number of seconds + after the container has started before startup probes are + initiated. + format: int32 + type: integer + periodSeconds: + default: 10 + description: PeriodSeconds is how often (in seconds) to perform + the probe. + format: int32 + type: integer + successThreshold: + default: 1 + description: SuccessThreshold is the minimum consecutive successes + for the probe to be considered successful. + format: int32 + type: integer + timeoutSeconds: + default: 10 + description: TimeoutSeconds is the number of seconds after + which the probe times out. 
+ format: int32 + type: integer + type: object type: object retentionPolicy: description: |-