Config: add new settings replacing hard-coded values
All checks were successful
ci/woodpecker/push/linters Pipeline was successful

This mostly relates to timeouts.

We use minutes because they are human-friendly, though some settings
could benefit from second-level granularity in certain cases.

We are open to changing this in the future.
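For illustration, the pattern this commit moves to is roughly the
following sketch: settings are read from the environment in minutes and
converted to seconds at the call sites that need them.
GROUP_INCIDENTS_MINUTES is one of the new settings; the surrounding
names are illustrative only.

    import os
    from datetime import timedelta

    # Read a human-friendly, minutes-based setting from the environment.
    group_incidents_minutes = timedelta(
        minutes=int(os.getenv("GROUP_INCIDENTS_MINUTES", "60"))
    )

    # Convert to seconds wherever an API expects them (e.g. reactor timeouts).
    timeout_seconds = group_incidents_minutes.total_seconds()  # 3600.0 by default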
Evilham 2023-12-11 19:09:07 +01:00
parent 45ed9bbed7
commit 23134fa2d6
Signed by: evilham
GPG key ID: AE3EE30D970886BF
4 changed files with 117 additions and 20 deletions

View file

@@ -63,9 +63,21 @@ MYPYPATH="./src"
# This includes the server private key and users' public keys.
# # Alerts processing
#NEW_INCIDENT_TIMEOUT="60"
#ALERT_RESOLVE_MINUTES="5"
#
# Environment: NEW_INCIDENT_TIMEOUT.
# Environment: ALERT_RESOLVE_MINUTES.
# Prometheus keeps sending alerts periodically for as long as it is
# aware of them, and when it determines they have been resolved, it
# sends us that information as well.
# However, under certain circumstances, an alert may be raised and
# then simply stop being sent.
# This variable sets how long we should remember such alerts that
# stop arriving but are never explicitly marked as resolved.
# Default value: 5 (minutes), matching AlertManager's default.
#GROUP_INCIDENTS_MINUTES="60"
#
# Environment: GROUP_INCIDENTS_MINUTES.
# Alerts that arrive at roughly the same time are grouped into incidents.
# Use this to configure the timeout (in minutes) after which new alerts
# are considered a separate incident.
@@ -73,3 +85,35 @@ MYPYPATH="./src"
# 09:20, 09:40, and 11:00, then the first 3 will be the same incident and
# the last one will begin a new incident.
# Default value: 60 (i.e. 1 hour).
#MONITORING_DOWN_MINUTES="2"
#
# Environment: MONITORING_DOWN_MINUTES.
# We expect Prometheus to get in touch with AdlerManager periodically.
# In order to achieve this, a special rule should be created (and
# silenced in AlertManager!), which acts as a "Heartbeat" signal.
# If we do not do this, monitoring may be down and we will never
# notice.
# The rule should look like this:
#
# # groups:
# #   - name: "AdlerManager at example.org"
# #     interval: 30s
# #     rules:
# #       - alert: "AdlerManager: Everything is fine"
# #         # This expression should always be true.
# #         # A neat trick is using the job that gathers prometheus metrics
# #         expr: up{job="prometheus"} == 1
# #         labels:
# #           # This must match the sitename in AdlerManager
# #           adlermanager: "example.org"
# #           # This must be set
# #           heartbeat: True
# #           # This is optional, but it is nice to have
# #           severity: OK
#
#
# This variable determines how long we can go without contact from
# Prometheus. When this time is exceeded, AdlerManager will fall back
# to a "Monitoring is Down" state.
# Default value: 2 (minutes)
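For a feel of how the default interacts with the example rule, a small
back-of-the-envelope sketch follows. The 30s interval comes from the
rule above; treating the ratio as "missed heartbeats" is an
approximation, since the exact reset behaviour lives in SiteManager
further down.

    from datetime import timedelta

    heartbeat_interval = timedelta(seconds=30)  # "interval: 30s" in the example rule
    monitoring_down = timedelta(minutes=2)      # MONITORING_DOWN_MINUTES default

    # Consecutive heartbeat evaluations that can be missed before AdlerManager
    # falls back to the "Monitoring is Down" state.
    print(monitoring_down // heartbeat_interval)  # 4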

View file

@@ -81,11 +81,26 @@ class ConfigClass(object):
    """
    # Alerts processing
    new_incident_timeout: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("NEW_INCIDENT_TIMEOUT", "60")))
    alert_resolve_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("ALERT_RESOLVE_MINUTES", "5")))
    )
    """
    @param new_incident_timeout: Environment: NEW_INCIDENT_TIMEOUT.
    @param alert_resolve_minutes: Environment: ALERT_RESOLVE_MINUTES.
    Prometheus keeps sending alerts periodically for as long as it is
    aware of them, and when it determines they have been resolved, it
    sends us that information as well.
    However, under certain circumstances, an alert may be raised and
    then simply stop being sent.
    This variable sets how long we should remember such alerts that
    stop arriving but are never explicitly marked as resolved.
    Default value: 5 (minutes), matching AlertManager's default.
    """
    group_incidents_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("GROUP_INCIDENTS_MINUTES", "60")))
    )
    """
    @param group_incidents_minutes: Environment: GROUP_INCIDENTS_MINUTES.
    Alerts that arrive at roughly the same time are grouped into incidents.
    Use this to configure the timeout (in minutes) after which new alerts
    are considered a separate incident.
@@ -95,6 +110,41 @@ class ConfigClass(object):
    Default value: 60 (i.e. 1 hour).
    """
    monitoring_down_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("MONITORING_DOWN_MINUTES", "2")))
    )
    """
    @param monitoring_down_minutes: Environment: MONITORING_DOWN_MINUTES.
    We expect Prometheus to get in touch with AdlerManager periodically.
    In order to achieve this, a special rule should be created (and
    silenced in AlertManager!), which acts as a "Heartbeat" signal.
    If we do not do this, monitoring may be down and we will never
    notice.
    The rule should look like this:
    # groups:
    #   - name: "AdlerManager at example.org"
    #     interval: 30s
    #     rules:
    #       - alert: "AdlerManager: Everything is fine"
    #         # This expression should always be true.
    #         # A neat trick is using the job that gathers prometheus metrics
    #         expr: up{job="prometheus"} == 1
    #         labels:
    #           # This must match the sitename in AdlerManager
    #           adlermanager: "example.org"
    #           # This must be set
    #           heartbeat: True
    #           # This is optional, but it is nice to have
    #           severity: OK
    This variable determines how long we can go without contact from
    Prometheus. When this time is exceeded, AdlerManager will fall back
    to a "Monitoring is Down" state.
    Default value: 2 (minutes)
    """
Config = ConfigClass()
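Note that, because the attr.ib defaults above are evaluated when the
class body is executed, the environment variables have to be set before
the module defining ConfigClass is imported. A minimal usage sketch
(the import path is deliberately omitted):

    import os

    # Set the new variables before importing the configuration module.
    os.environ["ALERT_RESOLVE_MINUTES"] = "10"
    os.environ["GROUP_INCIDENTS_MINUTES"] = "30"
    os.environ["MONITORING_DOWN_MINUTES"] = "5"

    # After the import, the timedeltas reflect these values:
    # Config.alert_resolve_minutes.total_seconds()    -> 600.0
    # Config.group_incidents_minutes.total_seconds()  -> 1800.0
    # Config.monitoring_down_minutes.total_seconds()  -> 300.0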

View file

@@ -27,12 +27,13 @@ class IncidentManager(object):
    _monitoring_down: bool = attr.ib(default=False)
    # _logs = attr.ib(factory=list)
    # TODO: Get IncidentClosing timeout from settings?
    # Defaulting to 30m
    _monitoring_grace_period: int = attr.ib(default=60 * 30)
    # Defaulting to 5m as alertmanager
    _alert_resolve_timeout: int = attr.ib(default=5 * 60)
    @property
    def incident_grouping_seconds(self) -> float:
        return self.global_config.group_incidents_minutes.total_seconds()
    @property
    def alert_resolve_seconds(self) -> float:
        return self.global_config.alert_resolve_minutes.total_seconds()
    def __attrs_post_init__(self) -> None:
        if not self.path.isdir():
@@ -63,7 +64,9 @@ class IncidentManager(object):
self._monitoring_down = False
# Monitoring is back up, re-activate timeout
self._timeout = task.deferLater(
reactor, self._monitoring_grace_period, self._expire # type: ignore
reactor, # type: ignore
self.incident_grouping_seconds,
self._expire,
)
self.log_event("[Meta]MonitoringUp", timestamp)
@@ -71,7 +74,7 @@
if alerts:
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._monitoring_grace_period, self._expire # type: ignore
reactor, self.incident_grouping_seconds, self._expire # type: ignore
).addErrback(default_errback)
self.last_alert = timestamp
@@ -92,7 +95,7 @@
self.active_alerts[alert_label] = alert
self._alert_timeouts[alert_label] = task.deferLater(
reactor, # type: ignore
self._alert_resolve_timeout,
self.alert_resolve_seconds,
self._expire_alert,
alert_label,
).addErrback(default_errback)
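The timeout handling above follows a cancel-and-reschedule pattern: the
pending deferLater is cancelled and a fresh one is scheduled with the
configured number of seconds (incident_grouping_seconds or
alert_resolve_seconds). A rough standalone sketch of that pattern, not
the project's actual classes; default_errback is the project's helper
and the lambda below is only a stand-in:

    from twisted.internet import defer, reactor, task

    class ResettableTimeout:
        """Cancel any pending timeout and schedule a fresh one on reset()."""

        def __init__(self, seconds, on_expire):
            self.seconds = seconds
            self.on_expire = on_expire
            self._timeout = None

        def reset(self):
            if self._timeout is not None and not self._timeout.called:
                self._timeout.cancel()
            self._timeout = task.deferLater(reactor, self.seconds, self.on_expire)
            # Cancelling a deferLater fires CancelledError; swallow it here.
            self._timeout.addErrback(lambda f: f.trap(defer.CancelledError))

Calling reset() on every incoming alert keeps pushing the expiry
forward, which is how the incident and per-alert timeouts stay alive
while data keeps arriving.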

View file

@@ -97,9 +97,10 @@ class SiteManager(object):
    _timeout: defer.Deferred[None] = attr.ib(factory=noop_deferred)
    site_name: str = attr.ib(default="")
    # TODO: Get monitoring timeout from config
    # Default to 2 mins
    _timeout_seconds = 2 * 60
    @property
    def monitoring_down_seconds(self) -> float:
        return self.global_config.monitoring_down_minutes.total_seconds()
    log: Logger = attr.ib(factory=Logger)
@@ -135,7 +136,7 @@
# Add/reset monitoring timeout
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._timeout_seconds, self.monitoring_down # type: ignore
reactor, self.monitoring_down_seconds, self.monitoring_down # type: ignore
).addErrback(default_errback)
return self
@@ -177,7 +178,7 @@
self.monitoring_is_down = False
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._timeout_seconds, self.monitoring_down # type: ignore
reactor, self.monitoring_down_seconds, self.monitoring_down # type: ignore
).addErrback(default_errback)
# Filter alerts for this site
@@ -228,7 +229,6 @@ class ServiceManager(object):
self.definition.clear()
self.definition.update(definition)
self.label = self.definition["label"]
# TODO: Recover status after server restart
self.component_labels.clear()
self.component_labels.extend(
[
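Finally, to tie the settings back to observable behaviour, here is a
standalone sketch of the incident grouping documented earlier (alerts
at 09:00, 09:20 and 09:40 share an incident, 11:00 starts a new one).
It mirrors the documented rule rather than reusing the project's
classes:

    from datetime import datetime, timedelta

    def group_into_incidents(timestamps, gap=timedelta(minutes=60)):
        """Start a new incident whenever the gap since the last alert exceeds `gap`."""
        incidents = []
        for ts in sorted(timestamps):
            if incidents and ts - incidents[-1][-1] <= gap:
                incidents[-1].append(ts)  # recent enough: same incident
            else:
                incidents.append([ts])  # too long since the last alert: new incident
        return incidents

    alerts = [datetime(2023, 12, 11, h, m) for h, m in [(9, 0), (9, 20), (9, 40), (11, 0)]]
    print(len(group_into_incidents(alerts)))  # 2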