Add separate helm values for the liveness and readiness probes

Signed-off-by: Tobias Giese <tgiese@nvidia.com>
2024-12-14 11:57:51 +00:00 · 2024-10-18 12:42:42 +02:00 · 2024-10-18 12:42:42 +02:00 · 52c2fc6498
commit 52c2fc6498
parent 901fbe2866
5 changed files with 226 additions and 109 deletions
--- a/deployment/helm/node-feature-discovery/templates/master.yaml
+++ b/deployment/helm/node-feature-discovery/templates/master.yaml
@ -48,9 +48,38 @@ spec:
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          livenessProbe:
-            {{- toYaml .Values.master.livenessProbe | nindent 12 }}
+            grpc:
              port: {{ .Values.master.healthPort | default "8082" }}
          {{- with .Values.master.livenessProbe.initialDelaySeconds }}
            initialDelaySeconds: {{ . }}
          {{- end }}
          {{- with .Values.master.livenessProbe.failureThreshold }}
            failureThreshold: {{ . }}
          {{- end }}
          {{- with .Values.master.livenessProbe.periodSeconds }}
            periodSeconds: {{ . }}
          {{- end }}
          {{- with .Values.master.livenessProbe.timeoutSeconds }}
            timeoutSeconds: {{ . }}
          {{- end }}
          readinessProbe:
-            {{- toYaml .Values.master.readinessProbe | nindent 12 }}
+            grpc:
              port: {{ .Values.master.healthPort | default "8082" }}
          {{- with .Values.master.readinessProbe.initialDelaySeconds }}
            initialDelaySeconds: {{ . }}
          {{- end }}
          {{- with .Values.master.readinessProbe.failureThreshold }}
            failureThreshold: {{ . }}
          {{- end }}
          {{- with .Values.master.readinessProbe.periodSeconds }}
            periodSeconds: {{ . }}
          {{- end }}
          {{- with .Values.master.readinessProbe.timeoutSeconds }}
            timeoutSeconds: {{ . }}
          {{- end }}
          {{- with .Values.master.readinessProbe.successThreshold }}
            successThreshold: {{ . }}
          {{- end }}
          ports:
          - containerPort: {{ .Values.master.port | default "8080" }}
            name: grpc
--- a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml
+++ b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml
@ -45,9 +45,38 @@ spec:
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
        imagePullPolicy: "{{ .Values.image.pullPolicy }}"
        livenessProbe:
-          {{- toYaml .Values.topologyUpdater.livenessProbe | nindent 10 }}
+          grpc:
            port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
        {{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
          initialDelaySeconds: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.livenessProbe.failureThreshold }}
          failureThreshold: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.livenessProbe.periodSeconds }}
          periodSeconds: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.livenessProbe.timeoutSeconds }}
          timeoutSeconds: {{ . }}
        {{- end }}
        readinessProbe:
-          {{- toYaml .Values.topologyUpdater.readinessProbe | nindent 10 }}
+          grpc:
            port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
        {{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
          initialDelaySeconds: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.readinessProbe.failureThreshold }}
          failureThreshold: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.readinessProbe.periodSeconds }}
          periodSeconds: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.readinessProbe.timeoutSeconds }}
          timeoutSeconds: {{ . }}
        {{- end }}
        {{- with .Values.topologyUpdater.readinessProbe.successThreshold }}
          successThreshold: {{ . }}
        {{- end }}
        env:
        - name: NODE_NAME
          valueFrom:
--- a/deployment/helm/node-feature-discovery/templates/worker.yaml
+++ b/deployment/helm/node-feature-discovery/templates/worker.yaml
@ -47,9 +47,38 @@ spec:
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
        imagePullPolicy: {{ .Values.image.pullPolicy }}
        livenessProbe:
-          {{- toYaml .Values.worker.livenessProbe | nindent 12 }}
+          grpc:
            port: {{ .Values.worker.healthPort | default "8082" }}
        {{- with .Values.worker.livenessProbe.initialDelaySeconds }}
          initialDelaySeconds: {{ . }}
        {{- end }}
        {{- with .Values.worker.livenessProbe.failureThreshold }}
          failureThreshold: {{ . }}
        {{- end }}
        {{- with .Values.worker.livenessProbe.periodSeconds }}
          periodSeconds: {{ . }}
        {{- end }}
        {{- with .Values.worker.livenessProbe.timeoutSeconds }}
          timeoutSeconds: {{ . }}
        {{- end }}
        readinessProbe:
-          {{- toYaml .Values.worker.readinessProbe | nindent 12 }}
+          grpc:
            port: {{ .Values.worker.healthPort | default "8082" }}
        {{- with .Values.worker.readinessProbe.initialDelaySeconds }}
          initialDelaySeconds: {{ . }}
        {{- end }}
        {{- with .Values.worker.readinessProbe.failureThreshold }}
          failureThreshold: {{ . }}
        {{- end }}
        {{- with .Values.worker.readinessProbe.periodSeconds }}
          periodSeconds: {{ . }}
        {{- end }}
        {{- with .Values.worker.readinessProbe.timeoutSeconds }}
          timeoutSeconds: {{ . }}
        {{- end }}
        {{- with .Values.worker.readinessProbe.successThreshold }}
          successThreshold: {{ . }}
        {{- end }}
        env:
        - name: NODE_NAME
          valueFrom:
--- a/deployment/helm/node-feature-discovery/values.yaml
+++ b/deployment/helm/node-feature-discovery/values.yaml
@ -150,12 +150,15 @@ master:
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
    # timeoutSeconds: 1
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
    # timeoutSeconds: 1
    # successThreshold: 1
 worker:
  enable: true
@ -426,12 +429,15 @@ worker:
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
    # timeoutSeconds: 1
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
    # timeoutSeconds: 1
    # successThreshold: 1
  serviceAccount:
    # Specifies whether a service account should be created.
@ -520,12 +526,15 @@ topologyUpdater:
    initialDelaySeconds: 10
    # failureThreshold: 3
    # periodSeconds: 10
    # timeoutSeconds: 1
  readinessProbe:
    grpc:
      port: 8082
    initialDelaySeconds: 5
    failureThreshold: 10
    # periodSeconds: 10
    # timeoutSeconds: 1
    # successThreshold: 1
  resources:
    limits:
--- a/docs/deployment/helm.md
+++ b/docs/deployment/helm.md
@ -177,13 +177,13 @@ API's you need to install the prometheus operator in your cluster.
 ### Master pod parameters
 | Name                                        | Type    | Default                          | Description                                                                                                                                                                                           |
-|-------------------------------------|---------|-----------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+|---------------------------------------------|---------|----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `master.*`                                  | dict    |                                  | NFD master deployment configuration                                                                                                                                                                   |
 | `master.enable`                             | bool    | true                             | Specifies whether nfd-master should be deployed                                                                                                                                                       |
 | `master.hostNetwork`                        | bool    | false                            | Specifies whether to enable or disable running the container in the host's network namespace                                                                                                          |
 | `master.port`                               | integer |                                  | Specifies the TCP port that nfd-master listens for incoming requests. **NOTE**: this parameter is related to the deprecated gRPC API and will be removed with it in a future release                  |
 | `master.metricsPort`                        | integer | 8081                             | Port on which to expose metrics from components to prometheus operator                                                                                                                                |
-| `master.healthPort`                 | integer | 8082                                                                  | Port on which to expose the grpc health endpoint                                                                                                                                                      |
+| `master.healthPort`                         | integer | 8082                             | Port on which to expose the grpc health endpoint, which is also used for the probes                                                                                                                   |
 | `master.instance`                           | string  |                                  | Instance name. Used to separate annotation namespaces for multiple parallel deployments                                                                                                               |
 | `master.resyncPeriod`                       | string  |                                  | NFD API controller resync period.                                                                                                                                                                     |
 | `master.extraLabelNs`                       | array   | []                               | List of allowed extra label namespaces                                                                                                                                                                |
@ -211,8 +211,15 @@ API's you need to install the prometheus operator in your cluster.
 | `master.extraArgs`                          | array   | []                               | Additional [command line arguments](../reference/master-commandline-reference.md) to pass to nfd-master                                                                                               |
 | `master.extraEnvs`                          | array   | []                               | Additional environment variables to pass to nfd-master                                                                                                                                                |
 | `master.revisionHistoryLimit`               | integer |                                  | Specify how many old ReplicaSets for this Deployment you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#revision-history-limit)         |
-| `master.livenessProbe`              | dict    | {"grpc":{"port":8082},"initialDelaySeconds":10}                       | NFD master pod [liveness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#liveness-probe)                                                                  |
+| `master.livenessProbe.initialDelaySeconds`  | integer | 10                               | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                                                                 |
-| `master.readinessProbe`             | dict    | {"grpc":{"port":8082},"initialDelaySeconds":5,"failureThreshold": 10} | NFD master pod [readiness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#readiness-probe)                                                                |
+| `master.livenessProbe.failureThreshold`     | integer | 3 (by Kubernetes)                | Specifies the number of consecutive failures of liveness probes before the container is restarted.                                                                                                    |
 | `master.livenessProbe.periodSeconds`        | integer | 10 (by Kubernetes)               | Specifies how often (in seconds) to perform the liveness probe.                                                                                                                                       |
 | `master.livenessProbe.timeoutSeconds`       | integer | 1 (by Kubernetes)                | Specifies the number of seconds after which the probe times out.                                                                                                                                      |
 | `master.readinessProbe.initialDelaySeconds` | integer | 5                                | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                                                                |
 | `master.readinessProbe.failureThreshold`    | integer | 10                               | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready.                                                                                             |
 | `master.readinessProbe.periodSeconds`       | integer | 10 (by Kubernetes)               | Specifies how often (in seconds) to perform the readiness probe.                                                                                                                                      |
 | `master.readinessProbe.timeoutSeconds`      | integer | 1 (by Kubernetes)                | Specifies the number of seconds after which the probe times out.                                                                                                                                      |
 | `master.readinessProbe.successThreshold`    | integer | 1 (by Kubernetes)                | Specifies the number of consecutive successes of readiness probes before considering the pod as ready.                                                                                                |
 > `[0]` Additional info for `master.resources.requests`: \
 > You may want to use the same value for `requests.memory` and `limits.memory`.
@ -228,12 +235,12 @@ API's you need to install the prometheus operator in your cluster.
 ### Worker pod parameters
 | Name                                        | Type    | Default                 | Description                                                                                                                                                                                                  |
-|-------------------------------------|---------|-----------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+|---------------------------------------------|---------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `worker.*`                                  | dict    |                         | NFD worker daemonset configuration                                                                                                                                                                           |
 | `worker.enable`                             | bool    | true                    | Specifies whether nfd-worker should be deployed                                                                                                                                                              |
 | `worker.hostNetwork`                        | bool    | false                   | Specifies whether to enable or disable running the container in the host's network namespace                                                                                                                 |
 | `worker.metricsPort`                        | int     | 8081                    | Port on which to expose metrics from components to prometheus operator                                                                                                                                       |
-| `worker.healthPort`                 | int     | 8082                                                                  | Port on which to expose the grpc health endpoint                                                                                                                                                             |
+| `worker.healthPort`                         | int     | 8082                    | Port on which to expose the grpc health endpoint, which is also used for the probes                                                                                                                          |
 | `worker.config`                             | dict    |                         | NFD worker [configuration](../reference/worker-configuration-reference)                                                                                                                                      |
 | `worker.podSecurityContext`                 | dict    | {}                      | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings        |
 | `worker.securityContext`                    | dict    | {}                      | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container)                                                           |
@ -252,13 +259,20 @@ API's you need to install the prometheus operator in your cluster.
 | `worker.extraArgs`                          | array   | []                      | Additional [command line arguments](../reference/worker-commandline-reference.md) to pass to nfd-worker                                                                                                      |
 | `worker.extraEnvs`                          | array   | []                      | Additional environment variables to pass to nfd-worker                                                                                                                                                       |
 | `worker.revisionHistoryLimit`               | integer |                         | Specify how many old ControllerRevisions for this DaemonSet you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/daemon-set-v1/#DaemonSetSpec)  |
-| `worker.livenessProbe`              | dict    | {"grpc":{"port":8082},"initialDelaySeconds":10}                       | NFD worker pod [liveness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#liveness-probe)                                                                         |
+| `worker.livenessProbe.initialDelaySeconds`  | integer | 10                      | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                                                                        |
-| `worker.readinessProbe`             | dict    | {"grpc":{"port":8082},"initialDelaySeconds":5,"failureThreshold": 10} | NFD worker pod [readiness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#readiness-probe)                                                                       |
+| `worker.livenessProbe.failureThreshold`     | integer | 3 (by Kubernetes)       | Specifies the number of consecutive failures of liveness probes before the container is restarted.                                                                                                           |
 | `worker.livenessProbe.periodSeconds`        | integer | 10 (by Kubernetes)      | Specifies how often (in seconds) to perform the liveness probe.                                                                                                                                              |
 | `worker.livenessProbe.timeoutSeconds`       | integer | 1 (by Kubernetes)       | Specifies the number of seconds after which the probe times out.                                                                                                                                             |
 | `worker.readinessProbe.initialDelaySeconds` | integer | 5                       | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                                                                       |
 | `worker.readinessProbe.failureThreshold`    | integer | 10                      | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready.                                                                                                    |
 | `worker.readinessProbe.periodSeconds`       | integer | 10 (by Kubernetes)      | Specifies how often (in seconds) to perform the readiness probe.                                                                                                                                             |
 | `worker.readinessProbe.timeoutSeconds`      | integer | 1 (by Kubernetes)       | Specifies the number of seconds after which the probe times out.                                                                                                                                             |
 | `worker.readinessProbe.successThreshold`    | integer | 1 (by Kubernetes)       | Specifies the number of consecutive successes of readiness probes before considering the pod as ready.                                                                                                       |
 ### Topology updater parameters
 | Name                                                 | Type    | Default                  | Description                                                                                                                                                                                                 |
-|-----------------------------------------------|---------|-----------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+|------------------------------------------------------|---------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `topologyUpdater.*`                                  | dict    |                          | NFD Topology Updater configuration                                                                                                                                                                          |
 | `topologyUpdater.enable`                             | bool    | false                    | Specifies whether the NFD Topology Updater should be created                                                                                                                                                |
 | `topologyUpdater.hostNetwork`                        | bool    | false                    | Specifies whether to enable or disable running the container in the host's network namespace                                                                                                                |
@ -268,7 +282,7 @@ API's you need to install the prometheus operator in your cluster.
 | `topologyUpdater.serviceAccount.name`                | string  |                          | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix                                  |
 | `topologyUpdater.rbac.create`                        | bool    | true                     | Specifies whether to create [RBAC][rbac] configuration for topology updater                                                                                                                                 |
 | `topologyUpdater.metricsPort`                        | integer | 8081                     | Port on which to expose prometheus metrics                                                                                                                                                                  |
-| `topologyUpdater.healthPort`                  | integer | 8082                                                                  | Port on which to expose the grpc health endpoint                                                                                                                                                            |
+| `topologyUpdater.healthPort`                         | integer | 8082                     | Port on which to expose the grpc health endpoint, will be also used for the probes                                                                                                                          |
 | `topologyUpdater.kubeletConfigPath`                  | string  | ""                       | Specifies the kubelet config host path                                                                                                                                                                      |
 | `topologyUpdater.kubeletPodResourcesSockPath`        | string  | ""                       | Specifies the kubelet sock path to read pod resources                                                                                                                                                       |
 | `topologyUpdater.updateInterval`                     | string  | 60s                      | Time to sleep between CR updates. Non-positive value implies no CR update.                                                                                                                                  |
@ -288,8 +302,15 @@ API's you need to install the prometheus operator in your cluster.
 | `topologyUpdater.extraArgs`                          | array   | []                       | Additional [command line arguments](../reference/topology-updater-commandline-reference.md) to pass to nfd-topology-updater                                                                                 |
 | `topologyUpdater.extraEnvs`                          | array   | []                       | Additional environment variables to pass to nfd-topology-updater                                                                                                                                            |
 | `topologyUpdater.revisionHistoryLimit`               | integer |                          | Specify how many old ControllerRevisions for this DaemonSet you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/daemon-set-v1/#DaemonSetSpec) |
-| `topologyUpdater.livenessProbe`               | dict    | {"grpc":{"port":8082},"initialDelaySeconds":10}                       | Topology updater pod [liveness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#liveness-probe)                                                                  |
+| `topologyUpdater.livenessProbe.initialDelaySeconds`  | integer | 10                       | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                                                                       |
-| `topologyUpdater.readinessProbe`              | dict    | {"grpc":{"port":8082},"initialDelaySeconds":5,"failureThreshold": 10} | Topology updater pod [readiness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#readiness-probe)                                                                |
+| `topologyUpdater.livenessProbe.failureThreshold`     | integer | 3 (by Kubernetes)        | Specifies the number of consecutive failures of liveness probes before the container is restarted.                                                                                                          |
 | `topologyUpdater.livenessProbe.periodSeconds`        | integer | 10 (by Kubernetes)       | Specifies how often (in seconds) to perform the liveness probe.                                                                                                                                             |
 | `topologyUpdater.livenessProbe.timeoutSeconds`       | integer | 1 (by Kubernetes)        | Specifies the number of seconds after which the probe times out.                                                                                                                                            |
 | `topologyUpdater.readinessProbe.initialDelaySeconds` | integer | 5                        | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                                                                      |
 | `topologyUpdater.readinessProbe.failureThreshold`    | integer | 10                       | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready.                                                                                                   |
 | `topologyUpdater.readinessProbe.periodSeconds`       | integer | 10 (by Kubernetes)       | Specifies how often (in seconds) to perform the readiness probe.                                                                                                                                            |
 | `topologyUpdater.readinessProbe.timeoutSeconds`      | integer | 1 (by Kubernetes)        | Specifies the number of seconds after which the probe times out.                                                                                                                                            |
 | `topologyUpdater.readinessProbe.successThreshold`    | integer | 1 (by Kubernetes)        | Specifies the number of consecutive successes of readiness probes before considering the pod as ready.                                                                                                      |
 ### Garbage collector parameters