1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

Enable metrics via prometheus operator

Expose metrics via prometheus.monitoring.coreos.com/v1

The exposed metrics are

| Metric        | Type | Meaning |
| --------------- | ---------------- | ---------------- |
|  `nfd_master_build_info`           | Gauge | Version from which nfd-master was built. |
|  `nfd_worker_build_info`           | Gauge | Version from which nfd-worker was built. |
|  `nfd_updated_nodes`           |  Counter | Time taken to label a node |
|  `nfd_crd_processing_time`          |  Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` |  HistogramVec | Time taken to discover features on a node |

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
Co-authored-by: Markus Lehtonen <markus.lehtonen@intel.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2023-06-06 16:39:02 +02:00
parent f02d172d07
commit e3aedd33e2
No known key found for this signature in database
GPG key ID: 5697017E44D90737
19 changed files with 344 additions and 23 deletions

View file

@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to listen for connections.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluster and exit.")
flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false,

View file

@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs)
"Kubeconfig to use")
flagset.BoolVar(&args.Oneshot, "oneshot", false,
"Do not publish feature labels")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.StringVar(&args.Options, "options", "",
"Specify config options from command line. Config options are specified "+
"in the same format as in the config file (i.e. json or yaml). These options")

View file

@ -35,5 +35,7 @@ spec:
command:
- "nfd-master"
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080

View file

@ -23,3 +23,6 @@ spec:
- "nfd-worker"
args:
- "-server=nfd-master:8080"
ports:
- name: metrics
containerPort: 8081

View file

@ -66,6 +66,8 @@ spec:
ports:
- containerPort: {{ .Values.master.port | default "8080" }}
name: grpc
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
name: metrics
env:
- name: NODE_NAME
valueFrom:
@ -118,6 +120,7 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
volumeMounts:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
@ -139,7 +142,6 @@ spec:
items:
- key: nfd-master.conf
path: nfd-master.conf
{{- with .Values.master.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}

View file

@ -0,0 +1,23 @@
{{- if .Values.prometheus.enable }}
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: {{ include "node-feature-discovery.fullname" . }}
labels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 4 }}
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- {{ include "node-feature-discovery.namespace" . }}
selector:
matchExpressions:
- {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]}
- {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" . }}"]}
{{- end }}

View file

@ -54,6 +54,10 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
- "-metrics={{ .Values.worker.metricsPort | default "8081"}}"
ports:
- name: metrics
containerPort: {{ .Values.worker.metricsPort | default "8081"}}
volumeMounts:
- name: host-boot
mountPath: "/host-boot"

View file

@ -31,6 +31,7 @@ master:
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
# The TCP port that nfd-master listens for incoming requests. Default: 8080
port: 8080
metricsPort: 8081
instance:
featureApi:
resyncPeriod:
@ -343,6 +344,7 @@ worker:
#
### <NFD-WORKER-CONF-END-DO-NOT-REMOVE>
metricsPort: 8081
daemonsetAnnotations: {}
podSecurityContext: {}
# fsGroup: 2000
@ -493,3 +495,6 @@ topologyGC:
tls:
enable: false
certManager: false
prometheus:
enable: false

View file

@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-feature-discovery
resources:
- monitor.yaml

View file

@ -0,0 +1,20 @@
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: In, values: ["nfd-master", "nfd-worker"]}

View file

@ -102,6 +102,11 @@ We have introduced the following Chart parameters.
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components |
| `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates |
| `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication.
| `prometheus.enable` | bool | false | Specifies whether to expose metrics using prometheus operator |
Metrics are configured to be exposed using prometheus operator API's by
default. If you want to expose metrics using the prometheus operator
API's you need to install the prometheus operator in your cluster.
### Master pod parameters
@ -109,6 +114,7 @@ We have introduced the following Chart parameters.
|-----------------------------|---------|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|
| `master.*` | dict | | NFD master deployment configuration |
| `master.port` | integer | | Specifies the TCP port that nfd-master listens for incoming requests. |
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator |
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
| `master.resyncPeriod` | string | | NFD API controller resync period. |
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |
@ -139,6 +145,7 @@ We have introduced the following Chart parameters.
| Name | Type | Default | description |
| ---- | ---- | ------- | ----------- |
| `worker.*` | dict | | NFD worker daemonset configuration |
| `worker.metricsPort*` | integer | 8081 | Port on which to expose metrics from components to prometheus operator |
| `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) |
| `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings |
| `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) |

View file

@ -57,6 +57,8 @@ scenarios under
see [Master Worker Topologyupdater](#master-worker-topologyupdater) below
- [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater):
see [Topology Updater](#topologyupdater) below
- [`Metrics`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus):
see [Metrics](#metrics) below
- [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune):
clean up the cluster after uninstallation, see
[Removing feature labels](uninstallation.md#removing-feature-labels)
@ -137,6 +139,17 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo
```
### Metrics
To allow [prometheus operator][prometheus-operator]
to scrape metrics from node-feature-discovery,
run the following command:
```bash
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }}
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }}
```
## Uninstallation
Simplest way is to invoke `kubectl delete` on the overlay that was used for
@ -162,3 +175,4 @@ kubectl delete clusterrolebinding nfd-master
<!-- Links -->
[kustomize]: https://github.com/kubernetes-sigs/kustomize
[prometheus-operator]: https://github.com/prometheus-operator/prometheus-operator

View file

@ -0,0 +1,43 @@
---
title: "Metrics"
layout: default
sort: 7
---
# Metrics
Metrics are configured to be exposed using [prometheus operator](https://github.com/prometheus-operator/prometheus-operator)
API's by default. If you want to expose metrics using the prometheus operator
API's you need to install the prometheus operator in your cluster.
By default NFD Master and Worker expose metrics on port 8081.
The exposed metrics are
| Metric | Type | Meaning |
| ---------------------------------- | ------- | ---------------- |
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. |
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. |
| `nfd_updated_nodes` | Counter | Time taken to label a node |
| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` | HistogramVec | Time taken to discover features on a node |
## Via Kustomize
To deploy NFD with metrics enabled using kustomize, you can use the
[Metrics Overlay](kustomize.md#metrics).
## Via Helm
By default metrics are enabled when deploying NFD via Helm. To enable Prometheus
to scrape metrics from NFD, you need to pass the following values to Helm:
```bash
--set prometheus.enable=true
```
For more info on Helm deployment, see [Helm](helm.md).
We recommend setting
`--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false`
when deploying prometheus-operator via Helm to enable the prometheus-operator
to scrape metrics from any PodMonitor.

16
go.mod
View file

@ -15,6 +15,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.4
github.com/opencontainers/runc v1.1.6
github.com/prometheus/client_golang v1.15.1
github.com/smartystreets/assertions v1.2.0
github.com/smartystreets/goconvey v1.6.4
github.com/stretchr/testify v1.8.1
@ -62,7 +63,7 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.1.3 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect
github.com/cilium/ebpf v0.10.0 // indirect
github.com/container-storage-interface/spec v1.7.0 // indirect
@ -111,7 +112,7 @@ require (
github.com/libopenstorage/openstorage v1.0.0 // indirect
github.com/lithammer/dedent v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.4.1 // indirect
@ -129,10 +130,9 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/rubiojr/go-vhd v0.0.0-20200706105327-02e210299021 // indirect
github.com/seccomp/libseccomp-golang v0.10.0 // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
@ -161,14 +161,14 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/oauth2 v0.5.0 // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.7.0 // indirect
golang.org/x/term v0.7.0 // indirect
golang.org/x/text v0.9.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.7.0 // indirect
google.golang.org/api v0.60.0 // indirect
google.golang.org/appengine v1.6.7 // indirect

29
go.sum
View file

@ -108,7 +108,6 @@ github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@ -123,8 +122,9 @@ github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInq
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk=
github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8=
github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
@ -451,8 +451,9 @@ github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czP
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/matttproud/golang_protobuf_extensions v1.0.2 h1:hAHbPm5IJGijwng3PWk09JkG9WeqChjprR5s9bBZ+OM=
github.com/matttproud/golang_protobuf_extensions v1.0.2/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible h1:aKW/4cBs+yK6gpqU3K/oIwk9Q/XICqd3zOX/UFuvqmk=
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible/go.mod h1:8AuVvqP/mXw1px98n46wfvcGfQ4ci2FwoAjKYxuo3Z4=
github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
@ -553,8 +554,9 @@ github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5Fsn
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0=
github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY=
github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw=
github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y=
github.com/prometheus/client_golang v1.15.1 h1:8tXpTmJbyH5lydzFPoxSIJ0J46jdh3tylbvM1xCv0LI=
github.com/prometheus/client_golang v1.15.1/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@ -567,16 +569,18 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc=
github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE=
github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA=
github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM=
github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo=
github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4=
github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI=
github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY=
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
@ -714,14 +718,13 @@ go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee33
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE=
go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@ -852,8 +855,9 @@ golang.org/x/oauth2 v0.0.0-20210805134026-6f1e6394065a/go.mod h1:KelEdhl1UZF7XfJ
golang.org/x/oauth2 v0.0.0-20210819190943-2bc19b11175f/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A=
golang.org/x/oauth2 v0.0.0-20211005180243-6b3c2da341f1/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A=
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A=
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b h1:clP8eMhB30EHdc0bd2Twtq6kgU7yl5ub2cQLSdrv1Dg=
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc=
golang.org/x/oauth2 v0.5.0 h1:HuArIo48skDwlrvM3sEdHXElYslAMsf3KwRkkW4MC4s=
golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@ -984,8 +988,9 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
@ -1003,7 +1008,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
@ -1235,7 +1239,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=

82
pkg/nfd-master/metrics.go Normal file
View file

@ -0,0 +1,82 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nfdmaster
import (
"fmt"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_master_build_info"
updatedNodesQuery = "nfd_updated_nodes"
crdProcessingTimeQuery = "nfd_crd_processing_time"
)
var (
srv *http.Server
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{
"version": version.Get(),
},
})
updatedNodes = prometheus.NewCounter(prometheus.CounterOpts{
Name: updatedNodesQuery,
Help: "Number of nodes updated by the master.",
})
crdProcessingTime = prometheus.NewGauge(prometheus.GaugeOpts{
Name: crdProcessingTimeQuery,
Help: "Time spent processing the NodeFeatureRule CRD.",
})
)
// registerVersion exposes the Operator build version.
func registerVersion(version string) {
buildInfo.SetToCurrentTime()
}
// runMetricsServer starts a http server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
updatedNodes,
crdProcessingTime)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
klog.InfoS("metrics server starting", "port", port)
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe())
}
// stopMetricsServer stops the metrics server
func stopMetricsServer() {
if srv != nil {
klog.InfoS("stopping metrics server", "port", srv.Addr)
srv.Close()
}
}

View file

@ -114,6 +114,7 @@ type Args struct {
VerifyNodeName bool
Options string
EnableLeaderElection bool
MetricsPort int
Overrides ConfigOverrideArgs
}
@ -241,6 +242,14 @@ func (m *nfdMaster) Run() error {
return fmt.Errorf("failed to update master node: %v", err)
}
}
// Register to metrics server
if m.args.MetricsPort > 0 {
go runMetricsServer(m.args.MetricsPort)
registerVersion(version.Get())
defer stopMetricsServer()
}
// Run gRPC server
grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr)
@ -841,6 +850,7 @@ func (m *nfdMaster) refreshNodeFeatures(cli *kubernetes.Clientset, nodeName stri
return err
}
updatedNodes.Inc()
return nil
}
@ -964,6 +974,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
}
// Process all rule CRs
processStart := time.Now()
for _, spec := range ruleSpecs {
switch {
case klog.V(3).Enabled():
@ -990,6 +1001,9 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
features.InsertAttributeFeatures(nfdv1alpha1.RuleBackrefDomain, nfdv1alpha1.RuleBackrefFeature, ruleOut.Vars)
}
}
processingTime := time.Since(processStart)
crdProcessingTime.Set(float64(processingTime))
klog.V(2).InfoS("processed NodeFeatureRule objects", "nodeName", nodeName, "objectCount", len(ruleSpecs), "duration", processingTime)
return labels, extendedResources, taints
}

80
pkg/nfd-worker/metrics.go Normal file
View file

@ -0,0 +1,80 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nfdworker
import (
"fmt"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_worker_build_info"
featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds"
)
var (
srv *http.Server
featureDiscoveryDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: featureDiscoveryDurationQuery,
Help: "Time taken to discover features",
},
[]string{"NodeName"},
)
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{
"version": version.Get(),
},
})
)
// registerVersion exposes the Operator build version.
func registerVersion(version string) {
buildInfo.SetToCurrentTime()
}
// runMetricsServer starts a http server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(featureDiscoveryDuration)
r.MustRegister(buildInfo)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
klog.InfoS("metrics server starting", "port", port)
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
klog.InfoS("metrics server stopped", "exit code", srv.ListenAndServe())
}
// stopMetricsServer stops the metrics server
func stopMetricsServer() {
if srv != nil {
klog.InfoS("stopping metrics server", "port", srv.Addr)
srv.Close()
}
}

View file

@ -101,6 +101,7 @@ type Args struct {
Options string
Server string
ServerNameOverride string
MetricsPort int
Overrides ConfigOverrideArgs
}
@ -197,10 +198,10 @@ func (w *nfdWorker) runFeatureDiscovery() error {
discoveryDuration := time.Since(discoveryStart)
klog.V(2).InfoS("feature discovery of all sources completed", "duration", discoveryDuration)
featureDiscoveryDuration.WithLabelValues(utils.NodeName()).Observe(discoveryDuration.Seconds())
if w.config.Core.SleepInterval.Duration > 0 && discoveryDuration > w.config.Core.SleepInterval.Duration/2 {
klog.InfoS("feature discovery sources took over half of sleep interval ", "duration", discoveryDuration, "sleepInterval", w.config.Core.SleepInterval.Duration)
}
// Get the set of feature labels.
labels := createFeatureLabels(w.labelSources, w.config.Core.LabelWhiteList.Regexp)
@ -239,6 +240,13 @@ func (w *nfdWorker) Run() error {
labelTrigger.Reset(w.config.Core.SleepInterval.Duration)
defer labelTrigger.Stop()
// Register to metrics server
if w.args.MetricsPort > 0 {
go runMetricsServer(w.args.MetricsPort)
registerVersion(version.Get())
defer stopMetricsServer()
}
err = w.runFeatureDiscovery()
if err != nil {
return err