From 718513a1cb0fa503d185567fe02249bbb6d2ddb7 Mon Sep 17 00:00:00 2001 From: Will Lewis <1543626+wrjlewis@users.noreply.github.com> Date: Wed, 1 May 2024 16:13:13 +0100 Subject: [PATCH 1/2] Add grafana chart for SS proxy --- grafana/README.md | 18 + grafana/sliding-sync.json | 2120 +++++++++++++++++++++++++++++++++++++ 2 files changed, 2138 insertions(+) create mode 100644 grafana/README.md create mode 100644 grafana/sliding-sync.json diff --git a/grafana/README.md b/grafana/README.md new file mode 100644 index 00000000..3981001a --- /dev/null +++ b/grafana/README.md @@ -0,0 +1,18 @@ +# Using the Sliding Sync Grafana dashboard + +**Set up Prometheus and Grafana.** Out of scope for this readme. Useful documentation about using Grafana with Prometheus: http://docs.grafana.org/features/datasources/prometheus/ + +**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=2112`. Metrics will be accessible at /metrics at this address. + +**Configure a new job in Prometheus** to scrape the sliding sync endpoint. + +For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally and you have `SYNCV3_PROM` configured to port 2112: + +``` +# Sliding Sync + - job_name: "Sliding Sync" + static_configs: + - targets: ["localhost:2112"] +``` + +**Import the sliding sync dashboard into Grafana.** Download `sliding-sync.json`. Import it to Grafana and select the correct Prometheus datasource. http://docs.grafana.org/reference/export_import/ \ No newline at end of file diff --git a/grafana/sliding-sync.json b/grafana/sliding-sync.json new file mode 100644 index 00000000..639f9144 --- /dev/null +++ b/grafana/sliding-sync.json @@ -0,0 +1,2120 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.4.1" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "panels": [], + "title": "Sliding Sync API", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Actively syncing clients i.e the connection hasn't expired yet. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "conns" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Expired Connections (full buffer)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 0, + 10 + ], + "fill": "dot" + } + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Expired Connections (timed out)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum(sliding_sync_api_num_active_conns)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Active Connections", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(sliding_sync_api_expiry_conn_buffer_full[5m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "Expired Connections (full buffer)", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(sliding_sync_api_expiry_conn_timed_out[5m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "Expired Connections (timed out)", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "# active sliding sync connections", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "sliding-sync-api-(.*)", + "renamePattern": "$1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The number of devices blocked on an initial v2 sync. This number should never remain >0 for more than 15 minutes, unless there is a flood of new users to the system.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "conns" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 166, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sliding_sync_api_num_devices_pending_ensure_polling", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "EnsurePolling calls outstanding", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "sliding-sync-api-(.*)", + "renamePattern": "$1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes time waiting for live updates. Excludes initial requests.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "99%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 35 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "95%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "50%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "25%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "75%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Request rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "hertz" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "90%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-orange", + "mode": "fixed" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Request rate" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 152, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "99%", + "range": true, + "refId": "99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "90%", + "range": true, + "refId": "90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "75%", + "range": true, + "refId": "75" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "50%", + "range": true, + "refId": "50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.25, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "25%", + "range": true, + "refId": "25" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(sliding_sync_api_process_duration_secs_count{initial=\"0\"}[$window_size]))", + "hide": false, + "instant": false, + "legendFormat": "Request rate", + "range": true, + "refId": "A" + } + ], + "title": "API processing time quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 157, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "builder", + "expr": "sum(rate(sliding_sync_api_setup_duration_secs_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request setup times", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Requests take more that 50s. This is our best proxy for \"how many people's requests are wedged\". Should be a flat 0.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 159, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(increase(sliding_sync_api_slow_requests[5m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Slow requests", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this goes high, it may indicate poor DB performance when querying.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 10, + "legend": { + "show": true + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.4.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(increase(sliding_sync_api_process_duration_secs_bucket{initial=\"1\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process initial sliding sync requests", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes live streaming blocks. If this goes high, it may indicate poor DB performance when querying.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 15, + "legend": { + "show": true + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 3 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.4.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process changes (ranges/filter/sorting/etc) in sliding sync requests", + "tooltip": { + "show": true, + "showHistogram": false + }, + "tooltipDecimals": 3, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketSize": "", + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "min": "0", + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 12, + "panels": [], + "title": "V2 Poller", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Number of /sync connections to upstream homeserver. Generally always goes up, unless users log out and invalidate the access_token being used.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pollers", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sliding_sync_poller_num_pollers{}", + "instant": false, + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "# v2 pollers", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The higher this is, the bigger the latency from sending events -> receiving events. Excludes initial /sync requests", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 8, + "legend": { + "show": false + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.3.3", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(increase(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process sync v2 responses", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this drops too low, this means pollers are blocked on something (DB conns, executor, etc)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "green", + "value": 0.75 + } + ] + }, + "unit": "percentunit", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 165, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sliding_sync_poller_num_outstanding_sync_v2_reqs{}/sliding_sync_poller_num_pollers{}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Ratio of pollers waiting for v2 response", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this number drops to 0, this indicates something is blocking all pollers from doing work. This number will scale with the number of pollers on the process.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 167, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "rate(sliding_sync_poller_total_num_polls{}[5m])", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of poll loop iterations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "NB: Proxy requests a timeline limit of 50.\n\nTODO: make it clear how many syncs were limited", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 16, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "v2 syncs" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "timeline events", + "axisPlacement": "left", + "max": "50", + "reverse": false + } + }, + "pluginVersion": "10.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_poller_timeline_size_bucket{limited=\"unlimited\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{limited}}", + "range": true, + "refId": "A" + } + ], + "title": "Timeline size of unlimited pollers", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Updates from v2 pollers sent to pubsub. Abnormal spikes could be from spam or lack of duplicate suppression in the proxy.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum by (payload_type) (rate(sliding_sync_poller_num_payloads{}[$window_size]))", + "legendFormat": "{{payload_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Payload Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes time waiting for v2 sync requests. Excludes initial polls.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s", + "unitScale": true + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "99%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 35 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "95%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "50%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "25%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "75%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Poll rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "hertz" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "90%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 153, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "99%", + "range": true, + "refId": "99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "90%", + "range": true, + "refId": "90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "75%", + "range": true, + "refId": "75" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "50%", + "range": true, + "refId": "50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.25, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "25%", + "range": true, + "refId": "25" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_poller_process_duration_secs_count{initial=\"0\"}[$window_size]))", + "hide": false, + "legendFormat": "Poll rate", + "range": true, + "refId": "A" + } + ], + "title": "Poller processing time quantiles", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 100, + "auto_min": "30s", + "current": { + "selected": true, + "text": "auto", + "value": "$__auto_interval_window_size" + }, + "description": "Window to use for aggregating buckets/moving averages", + "hide": 0, + "label": "Window size", + "name": "window_size", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_window_size" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "2h", + "value": "2h" + } + ], + "query": "30s,1m,2m,5m,10m,15m,30m,1h,2h", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "c433c715-0878-4d85-877c-465ce5b8cac4" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Sliding Sync", + "uid": "slidingsync", + "version": 27, + "weekStart": "" + } \ No newline at end of file From 62b181c3b6da8f2a0c5af3c593207d13153d2e81 Mon Sep 17 00:00:00 2001 From: Will Lewis <1543626+wrjlewis@users.noreply.github.com> Date: Thu, 16 May 2024 14:10:31 +0100 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Kegan Dougal <7190048+kegsay@users.noreply.github.com> --- grafana/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grafana/README.md b/grafana/README.md index 3981001a..e8d2103f 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -2,11 +2,11 @@ **Set up Prometheus and Grafana.** Out of scope for this readme. Useful documentation about using Grafana with Prometheus: http://docs.grafana.org/features/datasources/prometheus/ -**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=2112`. Metrics will be accessible at /metrics at this address. +**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=:2112`. Metrics will be accessible at `/metrics` at this address. **Configure a new job in Prometheus** to scrape the sliding sync endpoint. -For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally and you have `SYNCV3_PROM` configured to port 2112: +For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally not in docker and you have `SYNCV3_PROM` configured to port 2112: ``` # Sliding Sync