Merge pull request #3707 from icey-yu/feat-comment-383

feat: enhance configuration files with detailed comments for clarity
This commit is contained in:
icey-yu
2026-03-20 18:26:37 +08:00
committed by GitHub
15 changed files with 74 additions and 69 deletions
+22 -20
View File
@@ -1,34 +1,36 @@
# Global Alertmanager runtime and SMTP settings.
global: global:
resolve_timeout: 5m resolve_timeout: 5m # Wait time before an alert is considered resolved when no further updates are received.
smtp_from: alert@openim.io smtp_from: alert@openim.io # Sender address displayed in alert emails.
smtp_smarthost: smtp.163.com:465 smtp_smarthost: smtp.163.com:465 # SMTP relay endpoint in host:port format.
smtp_auth_username: alert@openim.io smtp_auth_username: alert@openim.io # SMTP authentication username (commonly the same as smtp_from).
smtp_auth_password: YOURAUTHPASSWORD smtp_auth_password: YOURAUTHPASSWORD # SMTP authorization token or app password.
smtp_require_tls: false smtp_require_tls: false # Set to true when your SMTP provider requires STARTTLS.
smtp_hello: xxx smtp_hello: xxx # HELO/EHLO identity presented to the SMTP server.
templates: templates:
- /etc/alertmanager/email.tmpl - /etc/alertmanager/email.tmpl # Go template file used to render HTML email content.
# Root routing tree for all incoming alerts.
route: route:
group_by: [ 'alertname' ] group_by: [ 'alertname' ] # Alerts sharing this label value are batched into one notification.
group_wait: 5s group_wait: 5s # Initial delay before sending the first notification for a new alert group.
group_interval: 5s group_interval: 5s # Minimum interval between notifications for the same alert group.
repeat_interval: 5m repeat_interval: 5m # Reminder interval while an alert group remains firing.
receiver: email receiver: email # Default receiver when no child route matches.
routes: routes:
- matchers: - matchers:
- alertname = "XXX" - alertname = "XXX" # Example matcher; replace with a real alert name or remove this route.
group_by: [ 'instance' ] group_by: [ 'instance' ] # Override grouping for this specific route.
group_wait: 5s group_wait: 5s
group_interval: 5s group_interval: 5s
repeat_interval: 5m repeat_interval: 5m
receiver: email receiver: email
receivers: receivers:
- name: email - name: email # Receiver name referenced by route.receiver.
email_configs: email_configs:
- to: 'alert@example.com' - to: 'alert@example.com' # Recipient mailbox for alert notifications.
html: '{{ template "email.to.html" . }}' html: '{{ template "email.to.html" . }}' # Rendered with the template declared in email.tmpl.
headers: { Subject: "[OPENIM-SERVER]Alarm" } headers: { Subject: "[OPENIM-SERVER]Alarm" } # Custom email subject line.
send_resolved: true send_resolved: true # Also send a notification when the alert recovers.
+3
View File
@@ -1,3 +1,6 @@
{{/* OpenIM Alertmanager email template.
This template renders both firing and resolved alerts.
Each alert entry reads labels and annotations from Prometheus rule definitions. */}}
{{ define "email.to.html" }} {{ define "email.to.html" }}
{{ if eq .Status "firing" }} {{ if eq .Status "firing" }}
{{ range .Alerts }} {{ range .Alerts }}
+14 -13
View File
@@ -1,30 +1,31 @@
# Default Prometheus alert groups for OpenIM.
groups: groups:
- name: instance_down - name: instance_down # Fires when a monitored target remains unreachable.
rules: rules:
- alert: InstanceDown - alert: InstanceDown
expr: up == 0 expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails.
for: 1m for: 1m # Trigger only if the condition remains true for more than 1 minute.
labels: labels:
severity: critical severity: critical # Used by Alertmanager for routing and notification priority.
annotations: annotations:
summary: "Instance {{ $labels.instance }} down" summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes." description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- name: database_insert_failure_alerts - name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB.
rules: rules:
- alert: DatabaseInsertFailed - alert: DatabaseInsertFailed
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes.
for: 1m for: 1m # Avoid firing on very short spikes.
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected" summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash." description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage."
- name: registrations_few - name: registrations_few # Operational early-warning rule for unusually low login/registration activity.
rules: rules:
- alert: RegistrationsFew - alert: RegistrationsFew
expr: increase(user_login_total[1h]) == 0 expr: increase(user_login_total[1h]) == 0 # No successful login/registration events observed in 1 hour.
for: 1m for: 1m
labels: labels:
severity: info severity: info
@@ -32,10 +33,10 @@ groups:
summary: "Too few registrations within the time frame" summary: "Too few registrations within the time frame"
description: "The number of registrations in the last hour is 0. There might be some issues." description: "The number of registrations in the last hour is 0. There might be some issues."
- name: messages_few - name: messages_few # Operational early-warning rule for unusually low messaging activity.
rules: rules:
- alert: MessagesFew - alert: MessagesFew
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour.
for: 1m for: 1m
labels: labels:
severity: info severity: info
+1 -1
View File
@@ -8,7 +8,7 @@ api:
prometheus: prometheus:
# Whether to enable prometheus # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# autoSetPorts indicates whether to automatically set the ports # autoSetPorts indicates whether to automatically set the ports
autoSetPorts: true autoSetPorts: true
+1 -1
View File
@@ -8,7 +8,7 @@ rpc:
ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ] ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -1,5 +1,5 @@
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# autoSetPorts indicates whether to automatically set the ports # autoSetPorts indicates whether to automatically set the ports
autoSetPorts: true autoSetPorts: true
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ] ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10200 ] ports: [ 10200 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10220 ] ports: [ 10220 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10240 ] ports: [ 10240 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10260 ] ports: [ 10260 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10280 ] ports: [ 10280 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10300 ] ports: [ 10300 ]
prometheus: prometheus:
# Enable or disable Prometheus monitoring # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup # List of ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+1 -1
View File
@@ -10,7 +10,7 @@ rpc:
ports: [ 10320 ] ports: [ 10320 ]
prometheus: prometheus:
# Whether to enable prometheus # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true enable: true
# Ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports # Ports on which this service exposes Prometheus metrics; the number of ports must match the number of rpc.ports
# It will only take effect when autoSetPorts is set to false. # It will only take effect when autoSetPorts is set to false.
+23 -24
View File
@@ -1,35 +1,34 @@
# my global config # Global Prometheus runtime settings.
global: global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s). # scrape_timeout defaults to 10s unless overridden in a specific scrape job.
# Alertmanager configuration # Alertmanager endpoints that receive alert events from Prometheus.
alerting: alerting:
alertmanagers: alertmanagers:
- static_configs: - static_configs:
- targets: [127.0.0.1:19093] - targets: [127.0.0.1:19093] # Alertmanager address in host:port format.
# Load rules once and periodically evaluate them according to the global evaluation_interval. # Rule files loaded by Prometheus.
rule_files: rule_files:
- instance-down-rules.yml - instance-down-rules.yml # Default OpenIM alert rules; add more files here if needed.
# - first_rules.yml # - first_rules.yml
# - second_rules.yml # - second_rules.yml
# A scrape configuration containing exactly one endpoint to scrape: # Scrape jobs used to collect infrastructure and OpenIM service metrics.
# Here it's Prometheus itself.
scrape_configs: scrape_configs:
# The job name is added as a label "job=job_name" to any timeseries scraped from this config. # The job_name value is attached as the "job" label in collected time series.
# Monitored information captured by prometheus
# prometheus fetches application services
- job_name: node_exporter - job_name: node_exporter
static_configs: static_configs:
- targets: [ 127.0.0.1:19100 ] - targets: [ 127.0.0.1:19100 ] # node_exporter endpoint for host CPU, memory, disk, and network metrics.
# OpenIM services are discovered dynamically from the admin API.
# For multi-host deployments, replace 127.0.0.1 with a reachable internal address.
- job_name: openimserver-openim-api - job_name: openimserver-openim-api
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/api" - url: "http://127.0.0.1:10002/prometheus_discovery/api" # Service discovery endpoint for OpenIM API instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12002 ] # - targets: [ 127.0.0.1:12002 ]
# labels: # labels:
@@ -37,7 +36,7 @@ scrape_configs:
- job_name: openimserver-openim-msggateway - job_name: openimserver-openim-msggateway
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" - url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" # Service discovery endpoint for msggateway instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12140 ] # - targets: [ 127.0.0.1:12140 ]
# # - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ] # # - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ]
@@ -46,7 +45,7 @@ scrape_configs:
- job_name: openimserver-openim-msgtransfer - job_name: openimserver-openim-msgtransfer
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" - url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" # Service discovery endpoint for msgtransfer instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ] # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ]
# # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ] # # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ]
@@ -55,7 +54,7 @@ scrape_configs:
- job_name: openimserver-openim-push - job_name: openimserver-openim-push
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/push" - url: "http://127.0.0.1:10002/prometheus_discovery/push" # Service discovery endpoint for push service instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ] # - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ]
## - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180, 127.0.0.1:12181, 127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185 ] ## - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180, 127.0.0.1:12181, 127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185 ]
@@ -64,7 +63,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-auth - job_name: openimserver-openim-rpc-auth
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/auth" - url: "http://127.0.0.1:10002/prometheus_discovery/auth" # Service discovery endpoint for auth RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12200 ] # - targets: [ 127.0.0.1:12200 ]
# labels: # labels:
@@ -72,7 +71,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-conversation - job_name: openimserver-openim-rpc-conversation
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/conversation" - url: "http://127.0.0.1:10002/prometheus_discovery/conversation" # Service discovery endpoint for conversation RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12220 ] # - targets: [ 127.0.0.1:12220 ]
# labels: # labels:
@@ -80,7 +79,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-friend - job_name: openimserver-openim-rpc-friend
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/friend" - url: "http://127.0.0.1:10002/prometheus_discovery/friend" # Service discovery endpoint for friend RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12240 ] # - targets: [ 127.0.0.1:12240 ]
# labels: # labels:
@@ -88,7 +87,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-group - job_name: openimserver-openim-rpc-group
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/group" - url: "http://127.0.0.1:10002/prometheus_discovery/group" # Service discovery endpoint for group RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12260 ] # - targets: [ 127.0.0.1:12260 ]
# labels: # labels:
@@ -96,7 +95,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-msg - job_name: openimserver-openim-rpc-msg
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg" - url: "http://127.0.0.1:10002/prometheus_discovery/msg" # Service discovery endpoint for msg RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12280 ] # - targets: [ 127.0.0.1:12280 ]
# labels: # labels:
@@ -104,7 +103,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-third - job_name: openimserver-openim-rpc-third
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/third" - url: "http://127.0.0.1:10002/prometheus_discovery/third" # Service discovery endpoint for third-party RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12300 ] # - targets: [ 127.0.0.1:12300 ]
# labels: # labels:
@@ -112,7 +111,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-user - job_name: openimserver-openim-rpc-user
http_sd_configs: http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/user" - url: "http://127.0.0.1:10002/prometheus_discovery/user" # Service discovery endpoint for user RPC instances.
# static_configs: # static_configs:
# - targets: [ 127.0.0.1:12320 ] # - targets: [ 127.0.0.1:12320 ]
# labels: # labels: