feat: enhance configuration files with detailed comments for clarity

This commit is contained in:
icey-yu
2026-03-20 18:24:51 +08:00
parent 5028624fa3
commit 801ac740b7
15 changed files with 74 additions and 69 deletions
+14 -13
View File
@@ -1,30 +1,31 @@
# Default Prometheus alert groups for OpenIM.
groups:
- name: instance_down
- name: instance_down # Fires when a monitored target remains unreachable.
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails.
for: 1m # Trigger only if the condition remains true for more than 1 minute.
labels:
severity: critical
severity: critical # Used by Alertmanager for routing and notification priority.
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- name: database_insert_failure_alerts
- name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB.
rules:
- alert: DatabaseInsertFailed
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
for: 1m
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes.
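for: 1m # Condition must hold for 1 minute before firing; because the expression uses a 5m window, it stays true for about 5 minutes after any single failure.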
labels:
severity: critical
annotations:
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage."
- name: registrations_few
- name: registrations_few # Operational early-warning rule for unusually low login/registration activity.
rules:
- alert: RegistrationsFew
expr: increase(user_login_total[1h]) == 0
expr: increase(user_login_total[1h]) == 0 # No increase in user_login_total over the last hour (the alert is named for registrations, but the expression watches the login counter).
for: 1m
labels:
severity: info
@@ -32,10 +33,10 @@ groups:
summary: "Too few registrations within the time frame"
description: "The number of registrations in the last hour is 0. There might be some issues."
- name: messages_few
- name: messages_few # Operational early-warning rule for unusually low messaging activity.
rules:
- alert: MessagesFew
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour.
for: 1m
labels:
severity: info