From 5fd9449b27394756e0baf76b1356900850f687a6 Mon Sep 17 00:00:00 2001 From: Armando Ruocco Date: Mon, 2 Dec 2024 14:51:32 +0100 Subject: [PATCH] feat: add `liveness` and `readiness` probe support (#69) Signed-off-by: Armando Ruocco Signed-off-by: Leonardo Cecchi Signed-off-by: Francesco Canovai Co-authored-by: Leonardo Cecchi Co-authored-by: Francesco Canovai --- cmd/manager/main.go | 2 + internal/cmd/healthcheck/doc.go | 2 + internal/cmd/healthcheck/main.go | 75 ++++++++++++++++++++++++++++ internal/cnpgi/common/health.go | 28 +++++++++++ internal/cnpgi/instance/start.go | 1 + internal/cnpgi/operator/lifecycle.go | 11 ++++ internal/cnpgi/restore/start.go | 3 ++ kubernetes/deployment.yaml | 5 ++ 8 files changed, 127 insertions(+) create mode 100644 internal/cmd/healthcheck/doc.go create mode 100644 internal/cmd/healthcheck/main.go create mode 100644 internal/cnpgi/common/health.go diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 6f37745..cafdb9f 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -10,6 +10,7 @@ import ( "github.com/spf13/cobra" ctrl "sigs.k8s.io/controller-runtime" + "github.com/cloudnative-pg/plugin-barman-cloud/internal/cmd/healthcheck" "github.com/cloudnative-pg/plugin-barman-cloud/internal/cmd/instance" "github.com/cloudnative-pg/plugin-barman-cloud/internal/cmd/operator" "github.com/cloudnative-pg/plugin-barman-cloud/internal/cmd/restore" @@ -32,6 +33,7 @@ func main() { rootCmd.AddCommand(instance.NewCmd()) rootCmd.AddCommand(operator.NewCmd()) rootCmd.AddCommand(restore.NewCmd()) + rootCmd.AddCommand(healthcheck.NewCmd()) if err := rootCmd.ExecuteContext(ctrl.SetupSignalHandler()); err != nil { if !errors.Is(err, context.Canceled) { diff --git a/internal/cmd/healthcheck/doc.go b/internal/cmd/healthcheck/doc.go new file mode 100644 index 0000000..fdc2150 --- /dev/null +++ b/internal/cmd/healthcheck/doc.go @@ -0,0 +1,2 @@ +// Package healthcheck contains the logic to execute a healthcheck on the plugin through a command 
+package healthcheck diff --git a/internal/cmd/healthcheck/main.go b/internal/cmd/healthcheck/main.go new file mode 100644 index 0000000..efc86a9 --- /dev/null +++ b/internal/cmd/healthcheck/main.go @@ -0,0 +1,75 @@ +package healthcheck + +import ( + "fmt" + "os" + "path" + + "github.com/cloudnative-pg/machinery/pkg/log" + "github.com/spf13/cobra" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/health/grpc_health_v1" + + "github.com/cloudnative-pg/plugin-barman-cloud/internal/cnpgi/metadata" +) + +// NewCmd returns the healthcheck command, the parent of the unix healthcheck subcommand +func NewCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "healthcheck", + Short: "healthcheck commands", + } + + cmd.AddCommand(unixHealthCheck()) + + return cmd +} + +func unixHealthCheck() *cobra.Command { + cmd := &cobra.Command{ + Use: "unix", + Short: "executes the health check command on unix:///plugins/barman-cloud.cloudnative-pg.io", + RunE: func(cmd *cobra.Command, _ []string) error { + dialPath := fmt.Sprintf("unix://%s", path.Join("/plugins", metadata.PluginName)) + cli, cliErr := grpc.NewClient(dialPath, grpc.WithTransportCredentials(insecure.NewCredentials())) + if cliErr != nil { + log.Error(cliErr, "while building the client") + return cliErr + } + + healthCli := grpc_health_v1.NewHealthClient(cli) + res, healthErr := healthCli.Check( + cmd.Context(), + &grpc_health_v1.HealthCheckRequest{}, + ) + if healthErr != nil { + log.Error(healthErr, "while executing the healthcheck call") + return healthErr + } + + if res.Status == grpc_health_v1.HealthCheckResponse_SERVING { + log.Trace("healthcheck response OK") + os.Exit(0) + return nil + } + + log.Error(fmt.Errorf("unexpected healthcheck status: %v", res.Status), + "while processing healthcheck response") + + // exit code 1 is returned when we exit from the function with an error, so each status below gets its own code + switch res.Status { + case grpc_health_v1.HealthCheckResponse_UNKNOWN: + os.Exit(2) + case 
grpc_health_v1.HealthCheckResponse_NOT_SERVING: + os.Exit(3) + default: + os.Exit(125) + } + + return nil + }, + } + + return cmd +} diff --git a/internal/cnpgi/common/health.go b/internal/cnpgi/common/health.go new file mode 100644 index 0000000..9eda587 --- /dev/null +++ b/internal/cnpgi/common/health.go @@ -0,0 +1,28 @@ +package common + +import ( + "context" + + "github.com/cloudnative-pg/machinery/pkg/log" + "google.golang.org/grpc" + "google.golang.org/grpc/health/grpc_health_v1" +) + +// AddHealthCheck registers the plugin health check service on the given gRPC server +func AddHealthCheck(server *grpc.Server) { + grpc_health_v1.RegisterHealthServer(server, &healthServer{}) // replaces default registration +} + +type healthServer struct { + grpc_health_v1.UnimplementedHealthServer +} + +// Check is the response handler for the healthcheck request; it always reports SERVING +func (h healthServer) Check( + ctx context.Context, + _ *grpc_health_v1.HealthCheckRequest, +) (*grpc_health_v1.HealthCheckResponse, error) { + contextLogger := log.FromContext(ctx) + contextLogger.Trace("serving health check response") + return &grpc_health_v1.HealthCheckResponse{Status: grpc_health_v1.HealthCheckResponse_SERVING}, nil +} diff --git a/internal/cnpgi/instance/start.go b/internal/cnpgi/instance/start.go index ce1c6d3..5c4a319 100644 --- a/internal/cnpgi/instance/start.go +++ b/internal/cnpgi/instance/start.go @@ -46,6 +46,7 @@ func (c *CNPGI) Start(ctx context.Context) error { ClusterObjectKey: c.ClusterObjectKey, InstanceName: c.InstanceName, }) + common.AddHealthCheck(server) return nil } diff --git a/internal/cnpgi/operator/lifecycle.go b/internal/cnpgi/operator/lifecycle.go index acceb16..c192967 100644 --- a/internal/cnpgi/operator/lifecycle.go +++ b/internal/cnpgi/operator/lifecycle.go @@ -235,10 +235,21 @@ func reconcilePodSpec( }, } + baseProbe := &corev1.Probe{ + FailureThreshold: 3, + ProbeHandler: corev1.ProbeHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"manager", 
"healthcheck", "unix"}, + }, + }, + } + // fixed values sidecarConfig.Name = "plugin-barman-cloud" sidecarConfig.Image = viper.GetString("sidecar-image") sidecarConfig.ImagePullPolicy = cluster.Spec.ImagePullPolicy + sidecarConfig.LivenessProbe = baseProbe.DeepCopy() + sidecarConfig.StartupProbe = baseProbe.DeepCopy() // merge the main container envs if they aren't already set for _, container := range spec.Containers { diff --git a/internal/cnpgi/restore/start.go b/internal/cnpgi/restore/start.go index 2d5411f..aef1449 100644 --- a/internal/cnpgi/restore/start.go +++ b/internal/cnpgi/restore/start.go @@ -49,6 +49,9 @@ func (c *CNPGI) Start(ctx context.Context) error { PgDataPath: c.PGDataPath, PgWalFolderToSymlink: PgWalVolumePgWalPath, }) + + common.AddHealthCheck(server) + return nil } diff --git a/kubernetes/deployment.yaml b/kubernetes/deployment.yaml index dfc299a..3d09adb 100644 --- a/kubernetes/deployment.yaml +++ b/kubernetes/deployment.yaml @@ -37,6 +37,11 @@ spec: - --server-address=:9090 - --leader-elect - --log-level=debug + readinessProbe: + tcpSocket: + port: 9090 + initialDelaySeconds: 10 + periodSeconds: 10 volumeMounts: - mountPath: /server name: server