---
# Default Prometheus alert groups for OpenIM.
# Each list entry under `groups` is one named rule group.
groups:
# Fires when a monitored target remains unreachable.
- name: instance_down
  rules:
  - alert: InstanceDown
    # The built-in "up" metric is 0 when the latest scrape of a target fails.
    expr: up == 0
    # Trigger only if the condition remains true for more than 1 minute.
    for: 1m
    labels:
      # Used by Alertmanager for routing and notification priority.
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

# Detects failures when persisting messages to Redis or MongoDB.
- name: database_insert_failure_alerts
  rules:
  - alert: DatabaseInsertFailed
    # Any positive increase of either failure counter over the last 5 minutes
    # satisfies the condition. The folded scalar below yields the same
    # single-line expression; it is wrapped only for line length.
    expr: >-
      (increase(msg_insert_redis_failed_total[5m]) > 0)
      or
      (increase(msg_insert_mongo_failed_total[5m]) > 0)
    # Avoid firing on very short spikes.
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
      description: >-
        Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter
        increased in the last 5 minutes, indicating message insert failures
        to Redis or MongoDB and a possible backend outage.

# Operational early-warning rule for unusually low login/registration activity.
- name: registrations_few
  rules:
  - alert: RegistrationsFew
    # True when user_login_total did not increase at all over the last hour.
    # NOTE(review): the alert and its annotations talk about registrations,
    # but the expression watches user_login_total - confirm which counter
    # was intended.
    # NOTE(review): a comparison against an absent series returns no result,
    # so this cannot fire if the metric is not exported at all - confirm
    # whether an absent() guard is wanted.
    expr: increase(user_login_total[1h]) == 0
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "Too few registrations within the time frame"
      description: "The number of registrations in the last hour is 0. There might be some issues."

# Operational early-warning rule for unusually low messaging activity.
- name: messages_few
  rules:
  - alert: MessagesFew
    # True when the single-chat and group-chat success counters together did
    # not increase over the last hour. The folded scalar below yields the same
    # single-line expression; it is wrapped only for line length.
    # NOTE(review): `+` pairs series with identical label sets, so a result
    # exists only where both counters are present for the same labels, and
    # the alert is evaluated per label set rather than across all instances -
    # confirm this is the intended scope.
    expr: >-
      (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h]))
      == 0
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "Too few messages within the time frame"
      description: "The number of messages sent in the last hour is 0. There might be some issues."