mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2024-12-14 11:58:02 +00:00
chore: export pipeline related metrics (#3104)
* chore: export pipeline related metrics Export in /metrics 1. Total pipeline queue length 2. Total pipeline commands 3. Total pipelined duration Signed-off-by: Roman Gershman <roman@dragonflydb.io> --------- Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
137bd313ef
commit
0394387a5f
9 changed files with 169 additions and 57 deletions
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
|
@ -7,6 +7,10 @@ on:
|
|||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
|
|
|
@ -97,10 +97,14 @@ class ConnectionContext {
|
|||
bool async_dispatch : 1; // whether this connection is amid an async dispatch
|
||||
bool sync_dispatch : 1; // whether this connection is amid a sync dispatch
|
||||
bool journal_emulated : 1; // whether it is used to dispatch journal commands
|
||||
bool paused = false; // whether this connection is paused due to CLIENT PAUSE
|
||||
|
||||
bool paused = false; // whether this connection is paused due to CLIENT PAUSE
|
||||
// whether it's blocked on blocking commands like BLPOP, needs to be addressable
|
||||
bool blocked = false;
|
||||
|
||||
// Skip ACL validation, used by internal commands and commands run on admin port
|
||||
bool skip_acl_validation = false;
|
||||
|
||||
// How many async subscription sources are active: monitor and/or pubsub - at most 2.
|
||||
uint8_t subscriptions;
|
||||
|
||||
|
@ -108,8 +112,6 @@ class ConnectionContext {
|
|||
std::string authed_username{"default"};
|
||||
uint32_t acl_categories{dfly::acl::ALL};
|
||||
std::vector<uint64_t> acl_commands;
|
||||
// Skip ACL validation, used by internal commands and commands run on admin port
|
||||
bool skip_acl_validation = false;
|
||||
// keys
|
||||
dfly::acl::AclKeys keys{{}, true};
|
||||
|
||||
|
|
|
@ -393,19 +393,6 @@ size_t Connection::MessageHandle::UsedMemory() const {
|
|||
return sizeof(MessageHandle) + visit(MessageSize{}, this->handle);
|
||||
}
|
||||
|
||||
bool Connection::MessageHandle::IsIntrusive() const {
|
||||
return holds_alternative<AclUpdateMessagePtr>(handle) ||
|
||||
holds_alternative<CheckpointMessage>(handle);
|
||||
}
|
||||
|
||||
bool Connection::MessageHandle::IsPipelineMsg() const {
|
||||
return holds_alternative<PipelineMessagePtr>(handle);
|
||||
}
|
||||
|
||||
bool Connection::MessageHandle::IsPubMsg() const {
|
||||
return holds_alternative<PubMessagePtr>(handle);
|
||||
}
|
||||
|
||||
bool Connection::MessageHandle::IsReplying() const {
|
||||
return IsPipelineMsg() || IsPubMsg() || holds_alternative<MonitorMessage>(handle) ||
|
||||
(holds_alternative<MCPipelineMessagePtr>(handle) &&
|
||||
|
@ -751,6 +738,9 @@ std::pair<std::string, std::string> Connection::GetClientInfoBeforeAfterTid() co
|
|||
|
||||
string after;
|
||||
absl::StrAppend(&after, " irqmatch=", int(cpu == my_cpu_id));
|
||||
if (dispatch_q_.size()) {
|
||||
absl::StrAppend(&after, " pipeline=", dispatch_q_.size());
|
||||
}
|
||||
absl::StrAppend(&after, " age=", now - creation_time_, " idle=", now - last_interaction_);
|
||||
string_view phase_name = PHASE_NAMES[phase_];
|
||||
|
||||
|
@ -1272,7 +1262,7 @@ void Connection::SquashPipeline(facade::SinkReplyBuilder* builder) {
|
|||
cc_->async_dispatch = false;
|
||||
|
||||
auto it = dispatch_q_.begin();
|
||||
while (it->IsIntrusive()) // Skip all newly received intrusive messages
|
||||
while (it->IsControl()) // Skip all newly received intrusive messages
|
||||
++it;
|
||||
|
||||
for (auto rit = it; rit != it + dispatched; ++rit)
|
||||
|
@ -1291,7 +1281,7 @@ void Connection::ClearPipelinedMessages() {
|
|||
// As well as to avoid pubsub backpressure leakege.
|
||||
for (auto& msg : dispatch_q_) {
|
||||
FiberAtomicGuard guard; // don't suspend when concluding to avoid getting new messages
|
||||
if (msg.IsIntrusive())
|
||||
if (msg.IsControl())
|
||||
visit(dispatch_op, msg.handle); // to not miss checkpoints
|
||||
RecycleMessage(std::move(msg));
|
||||
}
|
||||
|
@ -1309,7 +1299,7 @@ std::string Connection::DebugInfo() const {
|
|||
absl::StrAppend(&info, "closing=", cc_->conn_closing, ", ");
|
||||
absl::StrAppend(&info, "dispatch_fiber:joinable=", dispatch_fb_.IsJoinable(), ", ");
|
||||
|
||||
bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsIntrusive();
|
||||
bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsControl();
|
||||
absl::StrAppend(&info, "dispatch_queue:size=", dispatch_q_.size(), ", ");
|
||||
absl::StrAppend(&info, "dispatch_queue:pipelined=", pending_pipeline_cmd_cnt_, ", ");
|
||||
absl::StrAppend(&info, "dispatch_queue:intrusive=", intrusive_front, ", ");
|
||||
|
@ -1549,7 +1539,7 @@ void Connection::SendAsync(MessageHandle msg) {
|
|||
|
||||
// "Closing" connections might be still processing commands, as we don't interrupt them.
|
||||
// So we still want to deliver control messages to them (like checkpoints).
|
||||
if (cc_->conn_closing && !msg.IsIntrusive())
|
||||
if (cc_->conn_closing && !msg.IsControl())
|
||||
return;
|
||||
|
||||
// If we launch while closing, it won't be awaited. Control messages will be processed on cleanup.
|
||||
|
@ -1573,9 +1563,9 @@ void Connection::SendAsync(MessageHandle msg) {
|
|||
pending_pipeline_cmd_cnt_++;
|
||||
}
|
||||
|
||||
if (msg.IsIntrusive()) {
|
||||
if (msg.IsControl()) {
|
||||
auto it = dispatch_q_.begin();
|
||||
while (it < dispatch_q_.end() && it->IsIntrusive())
|
||||
while (it < dispatch_q_.end() && it->IsControl())
|
||||
++it;
|
||||
dispatch_q_.insert(it, std::move(msg));
|
||||
} else {
|
||||
|
|
|
@ -147,12 +147,21 @@ class Connection : public util::Connection {
|
|||
struct MessageHandle {
|
||||
size_t UsedMemory() const; // How much bytes this handle takes up in total.
|
||||
|
||||
// Intrusive messages put themselves at the front of the queue, but only after all other
|
||||
// intrusive ones. Used for quick transfer of control / update messages.
|
||||
bool IsIntrusive() const;
|
||||
// Control messages put themselves at the front of the queue, but only after all other
|
||||
// control ones. Used for management messages.
|
||||
bool IsControl() const {
|
||||
return std::holds_alternative<AclUpdateMessagePtr>(handle) ||
|
||||
std::holds_alternative<CheckpointMessage>(handle);
|
||||
}
|
||||
|
||||
bool IsPipelineMsg() const {
|
||||
return std::holds_alternative<PipelineMessagePtr>(handle);
|
||||
}
|
||||
|
||||
bool IsPubMsg() const {
|
||||
return std::holds_alternative<PubMessagePtr>(handle);
|
||||
}
|
||||
|
||||
bool IsPipelineMsg() const;
|
||||
bool IsPubMsg() const;
|
||||
bool IsReplying() const; // control messges don't reply, messages carrying data do
|
||||
|
||||
std::variant<MonitorMessage, PubMessagePtr, PipelineMessagePtr, MCPipelineMessagePtr,
|
||||
|
|
|
@ -48,13 +48,13 @@ inline std::string_view ToSV(std::string_view slice) {
|
|||
|
||||
struct ConnectionStats {
|
||||
size_t read_buf_capacity = 0; // total capacity of input buffers
|
||||
size_t dispatch_queue_entries = 0; // total number of dispatch queue entries
|
||||
uint64_t dispatch_queue_entries = 0; // total number of dispatch queue entries
|
||||
size_t dispatch_queue_bytes = 0; // total size of all dispatch queue entries
|
||||
size_t dispatch_queue_subscriber_bytes = 0; // total size of all publish messages
|
||||
|
||||
size_t pipeline_cmd_cache_bytes = 0;
|
||||
|
||||
size_t io_read_cnt = 0;
|
||||
uint64_t io_read_cnt = 0;
|
||||
size_t io_read_bytes = 0;
|
||||
|
||||
uint64_t command_cnt = 0;
|
||||
|
|
|
@ -1071,10 +1071,17 @@ void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse*
|
|||
MetricType::GAUGE, &resp->body());
|
||||
AppendMetricWithoutLabels("blocked_clients", "", conn_stats.num_blocked_clients,
|
||||
MetricType::GAUGE, &resp->body());
|
||||
AppendMetricWithoutLabels("dispatch_queue_bytes", "", conn_stats.dispatch_queue_bytes,
|
||||
AppendMetricWithoutLabels("pipeline_queue_bytes", "", conn_stats.dispatch_queue_bytes,
|
||||
MetricType::GAUGE, &resp->body());
|
||||
AppendMetricWithoutLabels("pipeline_queue_length", "", conn_stats.dispatch_queue_entries,
|
||||
MetricType::GAUGE, &resp->body());
|
||||
AppendMetricWithoutLabels("pipeline_cmd_cache_bytes", "", conn_stats.pipeline_cmd_cache_bytes,
|
||||
MetricType::GAUGE, &resp->body());
|
||||
AppendMetricWithoutLabels("pipeline_commands_total", "", conn_stats.pipelined_cmd_cnt,
|
||||
MetricType::COUNTER, &resp->body());
|
||||
AppendMetricWithoutLabels("pipeline_commands_duration_seconds", "",
|
||||
conn_stats.pipelined_cmd_latency * 1e-6, MetricType::COUNTER,
|
||||
&resp->body());
|
||||
|
||||
// Memory metrics
|
||||
auto sdata_res = io::ReadStatusInfo();
|
||||
|
@ -1977,7 +1984,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
|
|||
append("max_clients", GetFlag(FLAGS_maxclients));
|
||||
append("client_read_buffer_bytes", m.facade_stats.conn_stats.read_buf_capacity);
|
||||
append("blocked_clients", m.facade_stats.conn_stats.num_blocked_clients);
|
||||
append("dispatch_queue_entries", m.facade_stats.conn_stats.dispatch_queue_entries);
|
||||
append("pipeline_queue_length", m.facade_stats.conn_stats.dispatch_queue_entries);
|
||||
}
|
||||
|
||||
if (should_enter("MEMORY")) {
|
||||
|
|
|
@ -6,7 +6,7 @@ volumes:
|
|||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus
|
||||
image: prom/prometheus:v2.45.5
|
||||
restart: always
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus/
|
||||
|
|
|
@ -105,7 +105,7 @@
|
|||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.3.6",
|
||||
"pluginVersion": "10.1.10",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
|
@ -191,7 +191,7 @@
|
|||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.3.6",
|
||||
"pluginVersion": "10.1.10",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
|
@ -282,7 +282,7 @@
|
|||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.3.6",
|
||||
"pluginVersion": "10.1.10",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
|
@ -1609,10 +1609,24 @@
|
|||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"legendFormat": "switch",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr":
|
||||
"rate(dragonfly_fiber_longrun_seconds[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "longrun",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "FiberSwitchDelay",
|
||||
|
@ -1622,7 +1636,7 @@
|
|||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
|
@ -1673,8 +1687,7 @@
|
|||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
|
@ -1684,7 +1697,7 @@
|
|||
"x": 12,
|
||||
"y": 30
|
||||
},
|
||||
"id": 20,
|
||||
"id": 22,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
|
@ -1703,21 +1716,15 @@
|
|||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr":
|
||||
"rate(dragonfly_fiber_longrun_seconds_total[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"expr": "dragonfly_pipeline_queue_length/dragonfly_connected_clients",
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"legendFormat": "avr_pipeline_depth",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "FiberSwitchDelay",
|
||||
"transformations": [],
|
||||
"title": "Pipeline length",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
@ -1826,6 +1833,102 @@
|
|||
],
|
||||
"title": "Master Replication memory",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 38
|
||||
},
|
||||
"id": 23,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr":
|
||||
"rate(dragonfly_pipeline_commands_duration_seconds[$__rate_interval])/rate(dragonfly_pipeline_commands_total[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Pipeline latency",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "1m",
|
||||
|
@ -1948,7 +2051,7 @@
|
|||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-5m",
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
|
@ -1979,6 +2082,6 @@
|
|||
"timezone": "browser",
|
||||
"title": "Dragonfly Dashboard",
|
||||
"uid": "xDLNRKUWz",
|
||||
"version": 1,
|
||||
"version": 4,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
# my global config
|
||||
global:
|
||||
scrape_interval: 15s # By default, scrape targets every 15 seconds.
|
||||
evaluation_interval: 15s # By default, scrape targets every 15 seconds.
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
scrape_interval: 5s
|
||||
evaluation_interval: 5s
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
|
@ -43,8 +42,6 @@ scrape_configs:
|
|||
|
||||
|
||||
- job_name: 'node-exporter'
|
||||
|
||||
# Override the global default and scrape targets from this job every 5 seconds.
|
||||
scrape_interval: 1s
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
|
|
Loading…
Reference in a new issue