mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-15 17:50:48 +00:00
392 lines
14 KiB
YAML
392 lines
14 KiB
YAML
groups:
|
|
- name: promscale-general
  rules:
  # Fires when no `up` series exists for any job whose name contains
  # "promscale".
  # NOTE(review): absent() stays silent while a target is still scraped but
  # reports up == 0; confirm whether `absent(up{...} == 1)` was intended.
  - alert: PromscaleDown
    annotations:
      # absent() only carries over labels from equality matchers. The job
      # matcher below is a regex, so $labels.instance / $labels.job would
      # always render empty; the text names the selector instead.
      description: 'No Promscale target found: the up metric is absent for every job matching ".*promscale.*".'
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDown.md
      summary: Promscale is down.
    expr: absent(up{job=~".*promscale.*"})
    labels:
      severity: critical
|
|
- name: promscale-ingest
  rules:
  # Share of ingest requests answered with a 5xx code over the last 5m,
  # per (job, instance, namespace, type). Warning above 5%.
  - alert: PromscaleIngestHighErrorRate
    annotations:
      description: >-
        Promscale ingestion is having a {{ $value | humanizePercentage }}
        error rate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
      summary: High error rate in Promscale ingestion.
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total{code=~"5.."}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
  # Same ratio as the warning rule above; critical above 10%.
  - alert: PromscaleIngestHighErrorRate
    annotations:
      description: >-
        Promscale ingestion is having a {{ $value | humanizePercentage }}
        error rate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
      summary: High error rate in Promscale ingestion.
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total{code=~"5.."}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total[5m])
        )
      ) > 0.1
    labels:
      severity: critical
  # p90 of the ingest duration histogram over 5m exceeds 10s, sustained for
  # 5m. The `and ... > 0` leg restricts the alert to series that actually
  # recorded observations in the window.
  - alert: PromscaleIngestHighLatency
    annotations:
      # humanizeDuration already renders the unit, so no "seconds" suffix.
      description: >-
        Slowest 10% of ingestion batch took more than
        {{ $value | humanizeDuration }} to ingest.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
      summary: Slow Promscale ingestion.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_ingest_duration_seconds_bucket[5m])
          )
        ) > 10
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: warning
  # Same check as the warning rule above; critical when p90 exceeds 30s.
  - alert: PromscaleIngestHighLatency
    annotations:
      # humanizeDuration already renders the unit, so no "seconds" suffix.
      description: >-
        Slowest 10% of ingestion batch took more than
        {{ $value | humanizeDuration }} to ingest.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
      summary: Slow Promscale ingestion.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_ingest_duration_seconds_bucket[5m])
          )
        ) > 30
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: critical
  # Any non-zero 15m rate of duplicate samples, per (job, namespace), that
  # persists for 30m.
  - alert: PromscaleIngestHighDataDuplication
    annotations:
      description: >-
        More than {{ $value | humanize }} samples/sec are rejected as
        duplicates by promscale.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighDataDuplication.md
      summary: Duplicate data being inserted.
    expr: |
      sum by (job, namespace) (rate(promscale_ingest_duplicates_total{kind="sample"}[15m])) > 0
    for: 30m
    labels:
      severity: warning
|
|
- name: promscale-query
  rules:
  # 5xx share of non-canceled query requests per handler, for every handler
  # except /api/v1/query_range. Warning above 5%.
  - alert: PromscaleQueryHighErrorRate
    annotations:
      description: >-
        Evaluating queries via Promscale {{ $labels.handler }} endpoint
        has {{ $value | humanizePercentage }} error rate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
      summary: High error rate in querying Promscale.
    expr: |
      (
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{code=~"5..",handler!="/api/v1/query_range",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{handler!="/api/v1/query_range",err!="canceled"}[5m])
        )
      ) > 0.05
    labels:
      severity: warning
  # Same ratio restricted to the /api/v1/query_range handler, which gets a
  # higher warning threshold of 10%.
  - alert: PromscaleQueryHighErrorRate
    annotations:
      description: >-
        Evaluating queries via Promscale {{ $labels.handler }} endpoint
        has {{ $value | humanizePercentage }} error rate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
      summary: High error rate in querying Promscale.
    expr: |
      (
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{code=~"5..",handler="/api/v1/query_range",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{handler="/api/v1/query_range",err!="canceled"}[5m])
        )
      ) > 0.1
    labels:
      severity: warning
  # 5xx share across all handlers combined; critical above 10%.
  # Canceled requests are excluded from BOTH sides of the ratio so that the
  # numerator and denominator count the same population, as in the two
  # per-handler rules above.
  - alert: PromscaleQueryHighErrorRate
    annotations:
      description: >-
        Evaluating queries via Promscale had {{ $value | humanizePercentage }}
        error rate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
      summary: High error rate in querying Promscale.
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_query_requests_total{code=~"5..",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_query_requests_total{err!="canceled"}[5m])
        )
      ) > 0.1
    labels:
      severity: critical
  # p90 of the query duration histogram over 5m exceeds 5s, sustained for 5m,
  # limited to series that recorded observations in the window.
  - alert: PromscaleQueryHighLatency
    annotations:
      # humanizeDuration already renders the unit, so no "seconds" suffix.
      description: >-
        Slowest 10% of the queries took more than
        {{ $value | humanizeDuration }} to evaluate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
      summary: Slow Promscale querying.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_query_duration_seconds_bucket[5m])
          )
        ) > 5
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_query_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: warning
  # Same check as the warning rule above; critical when p90 exceeds 10s.
  - alert: PromscaleQueryHighLatency
    annotations:
      # humanizeDuration already renders the unit, so no "seconds" suffix.
      description: >-
        Slowest 10% of the queries took {{ $value | humanizeDuration }}
        to evaluate.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
      summary: Slow Promscale querying.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_query_duration_seconds_bucket[5m])
          )
        ) > 10
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_query_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: critical
|
|
- name: promscale-cache
  rules:
  # Per-second eviction rate (5m window) divided by the cache capacity.
  # Fires when more than 20% of the capacity is being evicted every second.
  - alert: PromscaleCacheHighNumberOfEvictions
    annotations:
      # $value is the ratio computed by expr (evictions/sec over capacity),
      # not a raw entries-per-second figure, so it is shown as a percentage.
      description: >-
        Promscale {{ $labels.name }} is evicting
        {{ $value | humanizePercentage }} of its capacity per second.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheHighNumberOfEvictions.md
      summary: High cache eviction in Promscale.
    expr: |
      (
        rate(promscale_cache_evictions_total[5m])
      /
        promscale_cache_capacity_elements
      ) > 0.2
    labels:
      severity: warning
|
|
- name: promscale-database-connection
  rules:
  # Database request errors divided by database requests over 5m, per
  # (job, instance, namespace, method). Warning above 5%.
  - alert: PromscaleStorageHighErrorRate
    annotations:
      description: >-
        Promscale connection with the database has an error rate of
        {{ $value | humanizePercentage }}.
      # NOTE(review): the runbook file name (PromscaleDBHighErrorRate) does
      # not match the alert name; confirm the link resolves.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDBHighErrorRate.md
      summary: Promscale experiences a high error rate when connecting to the database.
    expr: |
      (
        sum by (job, instance, namespace, method) (
          rate(promscale_database_request_errors_total[5m])
        )
      /
        sum by (job, instance, namespace, method) (
          rate(promscale_database_requests_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
  # p90 of database request duration over 5m exceeds 5s for any method other
  # than "exec", limited to methods that recorded requests in the window.
  - alert: PromscaleStorageHighLatency
    annotations:
      # humanizeDuration already renders the unit, so no "seconds" suffix.
      description: >-
        Slowest 10% of database requests with method {{ $labels.method }}
        are taking more than {{ $value | humanizeDuration }} to respond.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighLatency.md
      summary: Slow database response.
    expr: |
      (
        histogram_quantile(0.9,
          sum by (job, instance, namespace, method, le) (
            rate(promscale_database_requests_duration_seconds_bucket{method!="exec"}[5m])
          )
        ) > 5
      and
        sum by (job, instance, namespace, method) (
          rate(promscale_database_requests_duration_seconds_count{method!="exec"}[5m])
        ) > 0
      )
    labels:
      severity: warning
|
|
- name: promscale-database
  rules:
  # Failed database health checks divided by all health checks over 5m, per
  # (job, instance, namespace). Warning above 5%.
  - alert: PromscaleStorageUnhealthy
    annotations:
      # $value is the health-check failure ratio computed by expr.
      description: >-
        Promscale database health check has an error rate of
        {{ $value | humanizePercentage }}.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageUnhealthy.md
      summary: Promscale database is unhealthy.
    expr: |
      (
        sum by (job, instance, namespace) (
          rate(promscale_sql_database_health_check_errors_total[5m])
        )
      /
        sum by (job, instance, namespace) (
          rate(promscale_sql_database_health_check_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
  # Time elapsed since the maintenance job start timestamp exceeds one hour
  # (30 * 60 * 2 seconds). The second leg ignores series whose start
  # timestamp is 0.
  - alert: PromscaleMaintenanceJobRunningTooLong
    annotations:
      # $value is `time() - <job start timestamp>`, i.e. how long ago the job
      # started; humanizeDuration already renders the unit.
      description: >-
        Promscale maintenance job has been running for
        {{ $value | humanizeDuration }}.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
      summary: Promscale maintenance jobs taking too long to complete.
    expr: |
      (
        (
          (
            time()
            -
            promscale_sql_database_worker_maintenance_job_start_timestamp_seconds
          )
          >
          30 * 60 * 2 # 30 mins (we launch maintenance jobs scheduled at 30 mins) * 60 (to seconds) * 2 (wait max for 2 complete scans before firing alert).
        )
        and
          promscale_sql_database_worker_maintenance_job_start_timestamp_seconds > 0
      )
    labels:
      severity: warning
  # Four parallel checks joined with `or`, one per backlog gauge (metrics /
  # traces x uncompressed / expired). Each fires when the gauge's 1h minimum
  # is above promscale_sql_database_metric_count AND its 10m delta is
  # positive.
  # NOTE(review): the traces gauges are also compared against the *metric*
  # count; confirm that is intended.
  - alert: PromscaleMaintenanceJobNotKeepingup
    annotations:
      description: >-
        The amount of work for the promscale maintenance {{ $labels.name }}
        job is not decreasing for long time.
      # NOTE(review): this links to the PromscaleMaintenanceJobRunningTooLong
      # runbook; confirm whether a dedicated runbook exists for this alert.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
      summary: Promscale maintenance jobs are not keeping up.
    expr: |
      (
        (
          min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
        )
        and
        (
          delta(promscale_sql_database_chunks_metrics_uncompressed_count[10m]) > 0
        )
      )
      or
      (
        (
          min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
        )
        and
        (
          delta(promscale_sql_database_chunks_metrics_expired_count[10m]) > 0
        )
      )
      or
      (
        (
          min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
        )
        and
        (
          delta(promscale_sql_database_chunks_traces_uncompressed_count[10m]) > 0
        )
      )
      or
      (
        (
          min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
        )
        and
        (
          delta(promscale_sql_database_chunks_traces_expired_count[10m]) > 0
        )
      )
    labels:
      severity: warning
  # Fires while the maintenance-job failure gauge equals 1.
  - alert: PromscaleMaintenanceJobFailures
    annotations:
      description: >-
        Maintenance job for Promscale instance {{ $labels.instance }} failed
        to successfully execute.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobFailures.md
      summary: Promscale maintenance job failed.
    expr: promscale_sql_database_worker_maintenance_job_failed == 1
    labels:
      severity: warning
  # (total chunks - compressed chunks) / metric count is above 4, evaluated
  # only for series whose compression status gauge equals 1.
  - alert: PromscaleCompressionLow
    annotations:
      description: >-
        High uncompressed data in Promscale, on average, {{ $value }}
        uncompressed chunks per metric.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCompressionLow.md
      summary: High uncompressed data.
    expr: |
      (
        (
          (promscale_sql_database_chunks_count - promscale_sql_database_chunks_compressed_count) # Number of uncompressed chunks.
          /
          promscale_sql_database_metric_count
        ) > 4 # If total number of average uncompressed chunk per metric is more than 4 chunks at maximum, we should alert.
        and
          promscale_sql_database_compression_status == 1
      )
    labels:
      severity: warning
  # Combined table + index size of open chunks divided by the shared_buffers
  # size is above 1 (open chunks no longer fit), sustained for 10m.
  - alert: PromscalePostgreSQLSharedBuffersLow
    annotations:
      description: >-
        Currently open chunks are {{ $value | humanizePercentage }} of
        PostgreSQL shared_buffers. This will impact database performance.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscalePostgreSQLSharedBuffersLow.md
      summary: Promscale database performance will be affected.
    expr: |
      (
        ((promscale_sql_database_open_chunks_total_table_size + promscale_sql_database_open_chunks_total_index_size)
          /
        promscale_sql_database_shared_buffers_size)
      > 1 )
    for: 10m
    labels:
      severity: warning
|