Skip to content

Commit

Permalink
Merge pull request #1742 from flavio-fernandes/merge-06-jul-23
Browse files Browse the repository at this point in the history
SDN-3993: [DownstreamMerge] 06 jul 23
  • Loading branch information
openshift-merge-robot committed Jul 7, 2023
2 parents 1458500 + 2f8e12e commit d917250
Show file tree
Hide file tree
Showing 28 changed files with 975 additions and 251 deletions.
26 changes: 13 additions & 13 deletions dist/templates/ovnkube-alerts.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ spec:
description: There is no running ovnkube-manager

- alert: OvnKubeManagerMultipleLeaders
expr: sum(ovnkube_master_leader) > 1
expr: sum(ovnkube_controller_leader) > 1
for: 4m
labels:
severity: critical
annotations:
description: There are multiple ovnkube-manager leaders

- alert: OvnKubeManagerNoLeader
expr: max(ovnkube_master_leader) == 0
expr: max(ovnkube_controller_leader) == 0
for: 4m
labels:
severity: critical
Expand Down Expand Up @@ -61,7 +61,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_pod_creation_latency_seconds_bucket[15m])
rate(ovnkube_controller_pod_creation_latency_seconds_bucket[15m])
)
) > 5
for: 10m
Expand All @@ -76,14 +76,14 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_pod_creation_latency_seconds_bucket[15m]
rate(ovnkube_controller_pod_creation_latency_seconds_bucket[15m]
offset 15m)
)
)
-
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_pod_creation_latency_seconds_bucket[15m])
rate(ovnkube_controller_pod_creation_latency_seconds_bucket[15m])
)
) > 5

Expand All @@ -99,7 +99,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_ovn_cli_latency_seconds_bucket{
rate(ovnkube_controller_ovn_cli_latency_seconds_bucket{
command="ovn-nbctl"}[15m])
)
) > 3
Expand All @@ -115,7 +115,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_ovn_cli_latency_seconds_bucket{
rate(ovnkube_controller_ovn_cli_latency_seconds_bucket{
command="ovn-sbctl"}[15m])
)
) > 3
Expand All @@ -131,7 +131,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_resource_update_latency_seconds_bucket{
rate(ovnkube_controller_resource_update_latency_seconds_bucket{
name="NetworkPolicy"}[15m])
)
) > 1
Expand All @@ -147,7 +147,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_resource_update_latency_seconds_bucket{
rate(ovnkube_controller_resource_update_latency_seconds_bucket{
name="Namespace"}[15m])
)
) > 1
Expand All @@ -163,7 +163,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_resource_update_latency_seconds_bucket{
rate(ovnkube_controller_resource_update_latency_seconds_bucket{
name="Service"}[15m])
)
) > 1
Expand All @@ -179,7 +179,7 @@ spec:
expr: |
histogram_quantile(0.99,
sum by (le) (
rate(ovnkube_master_resource_update_latency_seconds_bucket{
rate(ovnkube_controller_resource_update_latency_seconds_bucket{
name="Endpoint"}[15m])
)
) > 1
Expand All @@ -192,7 +192,7 @@ spec:
across all masters for the last 15 minutes is more than 1 second.

- alert: OvnNBDBStale
expr: time() - max(ovnkube_master_nb_e2e_timestamp) > 120
expr: time() - max(ovnkube_controller_nb_e2e_timestamp) > 120
for: 4m
labels:
severity: critical
Expand All @@ -201,7 +201,7 @@ spec:
ovn-kubernetes has not written anything to the northbound database for too long

- alert: OvnSBDBStale
expr: max(ovnkube_master_nb_e2e_timestamp) - max(ovnkube_master_sb_e2e_timestamp) > 120
expr: max(ovnkube_controller_nb_e2e_timestamp) - max(ovnkube_controller_sb_e2e_timestamp) > 120
for: 4m
labels:
severity: critical
Expand Down
1 change: 1 addition & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Measurement accuracy can be impacted by other parallel processing that might be
This list is to help notify if there are additions, changes or removals to metrics. Latest changes are at the top of this list.

- Effect of OVN IC architecture:
- Move all the metrics from subsystem "ovnkube-master" to subsystem "ovnkube-controller". The non-IC and IC deployments will each continue to have their ovnkube-master and ovnkube-controller containers running inside the ovnkube-master and ovnkube-controller pods. The metrics scraping should work seamlessly. See https://github.com/ovn-org/ovn-kubernetes/pull/3723 for details.
- Move the following metrics from subsystem "master" to subsystem "clustermanager". Therefore, the following metrics are renamed.
- `ovnkube_master_num_v4_host_subnets` -> `ovnkube_clustermanager_num_v4_host_subnets`
- `ovnkube_master_num_v6_host_subnets` -> `ovnkube_clustermanager_num_v6_host_subnets`
Expand Down
22 changes: 11 additions & 11 deletions go-controller/cmd/ovnkube/ovnkube.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,9 @@ func startOvnKube(ctx *cli.Context, cancel context.CancelFunc) error {
return runOvnKube(ctx.Context, runMode, ovnClientset, eventRecorder)
}

// Register prometheus metrics that do not depend on becoming ovnkube-master
// leader and get the proper HA config depending on the mode. For network
// manager mode or combined cluster and network manager modes (the classic
// Register prometheus metrics that do not depend on becoming ovnkube-controller
// leader and get the proper HA config depending on the mode. For ovnkube
// controller mode or combined cluster manager and ovnkube-controller modes (the classic
// master mode), the master HA config applies. For cluster manager
// standalone mode, the cluster manager HA config applies.
var haConfig *config.HAConfig
Expand All @@ -313,7 +313,7 @@ func startOvnKube(ctx *cli.Context, cancel context.CancelFunc) error {
metrics.RegisterClusterManagerBase()
fallthrough
case runMode.ovnkubeController:
metrics.RegisterMasterBase()
metrics.RegisterOVNKubeControllerBase()
haConfig = &config.MasterHA
name = networkControllerManagerLockName()
case runMode.clusterManager:
Expand Down Expand Up @@ -483,7 +483,7 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util
defer cm.Stop()

// record delay until ready
metrics.MetricMasterReadyDuration.Set(time.Since(startTime).Seconds())
metrics.MetricOVNKubeControllerReadyDuration.Set(time.Since(startTime).Seconds())
}

if runMode.node {
Expand Down Expand Up @@ -544,22 +544,22 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util
return nil
}

type ovnkubeMasterMetrics struct {
type leaderMetrics struct {
runMode *ovnkubeRunMode
}

func (m ovnkubeMasterMetrics) On(string) {
func (m leaderMetrics) On(string) {
if m.runMode.ovnkubeController {
metrics.MetricMasterLeader.Set(1)
metrics.MetricOVNKubeControllerLeader.Set(1)
}
if m.runMode.clusterManager {
metrics.MetricClusterManagerLeader.Set(1)
}
}

func (m ovnkubeMasterMetrics) Off(string) {
func (m leaderMetrics) Off(string) {
if m.runMode.ovnkubeController {
metrics.MetricMasterLeader.Set(0)
metrics.MetricOVNKubeControllerLeader.Set(0)
}
if m.runMode.clusterManager {
metrics.MetricClusterManagerLeader.Set(0)
Expand All @@ -571,7 +571,7 @@ type ovnkubeMetricsProvider struct {
}

func (p ovnkubeMetricsProvider) NewLeaderMetric() leaderelection.SwitchMetric {
return &ovnkubeMasterMetrics{p.runMode}
return &leaderMetrics{p.runMode}
}

func networkControllerManagerLockName() string {
Expand Down
2 changes: 1 addition & 1 deletion go-controller/pkg/metrics/cluster_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (

var registerClusterManagerBaseMetrics sync.Once

// MetricMasterLeader identifies whether this instance of ovnkube-master is a leader or not
// MetricClusterManagerLeader identifies whether this instance of ovnkube-cluster-manager is a leader or not
var MetricClusterManagerLeader = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: MetricOvnkubeNamespace,
Subsystem: MetricOvnkubeSubsystemClusterManager,
Expand Down
2 changes: 1 addition & 1 deletion go-controller/pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (

const (
MetricOvnkubeNamespace = "ovnkube"
MetricOvnkubeSubsystemMaster = "master"
MetricOvnkubeSubsystemController = "controller"
MetricOvnkubeSubsystemClusterManager = "clustermanager"
MetricOvnkubeSubsystemNode = "node"
MetricOvnNamespace = "ovn"
Expand Down
Loading

0 comments on commit d917250

Please sign in to comment.