Config: add new settings replacing hard-coded values
All checks were successful
ci/woodpecker/push/linters Pipeline was successful

This mostly relates to timeouts.

We use minutes because they are human-friendly, though some settings
could benefit from second-level granularity in certain cases.

We are open to changing this in the future.
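For illustration, the pattern this commit moves to is roughly the
following sketch: settings are read from the environment in minutes and
converted to seconds at the call sites that need them.
GROUP_INCIDENTS_MINUTES is one of the new settings; the surrounding
names are illustrative only.

    import os
    from datetime import timedelta

    # Read a human-friendly, minutes-based setting from the environment.
    group_incidents_minutes = timedelta(
        minutes=int(os.getenv("GROUP_INCIDENTS_MINUTES", "60"))
    )

    # Convert to seconds wherever an API expects them (e.g. reactor timeouts).
    timeout_seconds = group_incidents_minutes.total_seconds()  # 3600.0 by default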
Evilham 2023-12-11 19:09:07 +01:00
parent 45ed9bbed7
commit 23134fa2d6
Signed by: evilham
GPG key ID: AE3EE30D970886BF
4 changed files with 117 additions and 20 deletions

View file

@@ -63,9 +63,21 @@ MYPYPATH="./src"
# This includes the server private key and users' public keys.
# # Alerts processing
#NEW_INCIDENT_TIMEOUT="60"
#ALERT_RESOLVE_MINUTES="5"
#
# Environment: NEW_INCIDENT_TIMEOUT.
# Environment: ALERT_RESOLVE_MINUTES.
# Prometheus keeps sending alerts periodically for as long as it is
# aware of them, and when it determines they have been resolved, it
# sends us that information as well.
# However, under certain circumstances, an alert may be raised and
# then simply stop being sent.
# This variable sets how long we should remember such alerts that
# stop arriving but are never explicitly marked as resolved.
# Default value: 5 (minutes), matching AlertManager's default.
#GROUP_INCIDENTS_MINUTES="60"
#
# Environment: GROUP_INCIDENTS_MINUTES.
# Alerts that arrive at roughly the same time are grouped into incidents.
# Use this to configure the timeout (in minutes) after which new alerts
# are considered a separate incident.
@@ -73,3 +85,35 @@ MYPYPATH="./src"
# 09:20, 09:40, and 11:00, then the first 3 will be the same incident and
# the last one will begin a new incident.
# Default value: 60 (i.e. 1 hour).
#MONITORING_DOWN_MINUTES="2"
#
# Environment: MONITORING_DOWN_MINUTES.
# We expect Prometheus to get in touch with AdlerManager periodically.
# In order to achieve this, a special rule should be created (and
# silenced in AlertManager!), which acts as a "Heartbeat" signal.
# If we do not do this, monitoring may be down and we will never
# notice.
# The rule should look like this:
#
# # groups:
# #   - name: "AdlerManager at example.org"
# #     interval: 30s
# #     rules:
# #       - alert: "AdlerManager: Everything is fine"
# #         # This expression should always be true.
# #         # A neat trick is using the job that gathers prometheus metrics
# #         expr: up{job="prometheus"} == 1
# #         labels:
# #           # This must match the sitename in AdlerManager
# #           adlermanager: "example.org"
# #           # This must be set
# #           heartbeat: True
# #           # This is optional, but it is nice to have
# #           severity: OK
#
#
# This variable determines how long we can go without contact from
# Prometheus. When this time is exceeded, AdlerManager will fall back
# to a "Monitoring is Down" state.
# Default value: 2 (minutes)
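For a feel of how the default interacts with the example rule, a small
back-of-the-envelope sketch follows. The 30s interval comes from the
rule above; treating the ratio as "missed heartbeats" is an
approximation, since the exact reset behaviour lives in SiteManager
further down.

    from datetime import timedelta

    heartbeat_interval = timedelta(seconds=30)  # "interval: 30s" in the example rule
    monitoring_down = timedelta(minutes=2)      # MONITORING_DOWN_MINUTES default

    # Consecutive heartbeat evaluations that can be missed before AdlerManager
    # falls back to the "Monitoring is Down" state.
    print(monitoring_down // heartbeat_interval)  # 4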

View file

@@ -81,11 +81,26 @@ class ConfigClass(object):
    """
    # Alerts processing
    new_incident_timeout: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("NEW_INCIDENT_TIMEOUT", "60")))
    alert_resolve_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("ALERT_RESOLVE_MINUTES", "5")))
    )
    """
    @param new_incident_timeout: Environment: NEW_INCIDENT_TIMEOUT.
    @param alert_resolve_minutes: Environment: ALERT_RESOLVE_MINUTES.
    Prometheus keeps sending alerts periodically for as long as it is
    aware of them, and when it determines they have been resolved, it
    sends us that information as well.
    However, under certain circumstances, an alert may be raised and
    then simply stop being sent.
    This variable sets how long we should remember such alerts that
    stop arriving but are never explicitly marked as resolved.
    Default value: 5 (minutes), matching AlertManager's default.
    """
    group_incidents_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("GROUP_INCIDENTS_MINUTES", "60")))
    )
    """
    @param group_incidents_minutes: Environment: GROUP_INCIDENTS_MINUTES.
    Alerts that arrive at roughly the same time are grouped into incidents.
    Use this to configure the timeout (in minutes) after which new alerts
    are considered a separate incident.
@@ -95,6 +110,41 @@ class ConfigClass(object):
    Default value: 60 (i.e. 1 hour).
    """
    monitoring_down_minutes: timedelta = attr.ib(
        default=timedelta(minutes=int(os.getenv("MONITORING_DOWN_MINUTES", "2")))
    )
    """
    @param monitoring_down_minutes: Environment: MONITORING_DOWN_MINUTES.
    We expect Prometheus to get in touch with AdlerManager periodically.
    In order to achieve this, a special rule should be created (and
    silenced in AlertManager!), which acts as a "Heartbeat" signal.
    If we do not do this, monitoring may be down and we will never
    notice.
    The rule should look like this:
    # groups:
    #   - name: "AdlerManager at example.org"
    #     interval: 30s
    #     rules:
    #       - alert: "AdlerManager: Everything is fine"
    #         # This expression should always be true.
    #         # A neat trick is using the job that gathers prometheus metrics
    #         expr: up{job="prometheus"} == 1
    #         labels:
    #           # This must match the sitename in AdlerManager
    #           adlermanager: "example.org"
    #           # This must be set
    #           heartbeat: True
    #           # This is optional, but it is nice to have
    #           severity: OK
    This variable determines how long we can go without contact from
    Prometheus. When this time is exceeded, AdlerManager will fall back
    to a "Monitoring is Down" state.
    Default value: 2 (minutes)
    """
Config = ConfigClass()
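Note that, because the attr.ib defaults above are evaluated when the
class body is executed, the environment variables have to be set before
the module defining ConfigClass is imported. A minimal usage sketch
(the import path is deliberately omitted):

    import os

    # Set the new variables before importing the configuration module.
    os.environ["ALERT_RESOLVE_MINUTES"] = "10"
    os.environ["GROUP_INCIDENTS_MINUTES"] = "30"
    os.environ["MONITORING_DOWN_MINUTES"] = "5"

    # After the import, the timedeltas reflect these values:
    # Config.alert_resolve_minutes.total_seconds()    -> 600.0
    # Config.group_incidents_minutes.total_seconds()  -> 1800.0
    # Config.monitoring_down_minutes.total_seconds()  -> 300.0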

View file

@@ -27,12 +27,13 @@ class IncidentManager(object):
    _monitoring_down: bool = attr.ib(default=False)
    # _logs = attr.ib(factory=list)
    # TODO: Get IncidentClosing timeout from settings?
    # Defaulting to 30m
    _monitoring_grace_period: int = attr.ib(default=60 * 30)
    # Defaulting to 5m as alertmanager
    _alert_resolve_timeout: int = attr.ib(default=5 * 60)
    @property
    def incident_grouping_seconds(self) -> float:
        return self.global_config.group_incidents_minutes.total_seconds()
    @property
    def alert_resolve_seconds(self) -> float:
        return self.global_config.alert_resolve_minutes.total_seconds()
    def __attrs_post_init__(self) -> None:
        if not self.path.isdir():
@@ -63,7 +64,9 @@ class IncidentManager(object):
self._monitoring_down = False
# Monitoring is back up, re-activate timeout
self._timeout = task.deferLater(
reactor, self._monitoring_grace_period, self._expire # type: ignore
reactor, # type: ignore
self.incident_grouping_seconds,
self._expire,
)
self.log_event("[Meta]MonitoringUp", timestamp)
@@ -71,7 +74,7 @@
if alerts:
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._monitoring_grace_period, self._expire # type: ignore
reactor, self.incident_grouping_seconds, self._expire # type: ignore
).addErrback(default_errback)
self.last_alert = timestamp
@@ -92,7 +95,7 @@
self.active_alerts[alert_label] = alert
self._alert_timeouts[alert_label] = task.deferLater(
reactor, # type: ignore
self._alert_resolve_timeout,
self.alert_resolve_seconds,
self._expire_alert,
alert_label,
).addErrback(default_errback)
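The timeout handling above follows a cancel-and-reschedule pattern: the
pending deferLater is cancelled and a fresh one is scheduled with the
configured number of seconds (incident_grouping_seconds or
alert_resolve_seconds). A rough standalone sketch of that pattern, not
the project's actual classes; default_errback is the project's helper
and the lambda below is only a stand-in:

    from twisted.internet import defer, reactor, task

    class ResettableTimeout:
        """Cancel any pending timeout and schedule a fresh one on reset()."""

        def __init__(self, seconds, on_expire):
            self.seconds = seconds
            self.on_expire = on_expire
            self._timeout = None

        def reset(self):
            if self._timeout is not None and not self._timeout.called:
                self._timeout.cancel()
            self._timeout = task.deferLater(reactor, self.seconds, self.on_expire)
            # Cancelling a deferLater fires CancelledError; swallow it here.
            self._timeout.addErrback(lambda f: f.trap(defer.CancelledError))

Calling reset() on every incoming alert keeps pushing the expiry
forward, which is how the incident and per-alert timeouts stay alive
while data keeps arriving.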

View file

@@ -97,9 +97,10 @@ class SiteManager(object):
    _timeout: defer.Deferred[None] = attr.ib(factory=noop_deferred)
    site_name: str = attr.ib(default="")
    # TODO: Get monitoring timeout from config
    # Default to 2 mins
    _timeout_seconds = 2 * 60
    @property
    def monitoring_down_seconds(self) -> float:
        return self.global_config.monitoring_down_minutes.total_seconds()
    log: Logger = attr.ib(factory=Logger)
@@ -135,7 +136,7 @@
# Add/reset monitoring timeout
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._timeout_seconds, self.monitoring_down # type: ignore
reactor, self.monitoring_down_seconds, self.monitoring_down # type: ignore
).addErrback(default_errback)
return self
@@ -177,7 +178,7 @@
self.monitoring_is_down = False
self._timeout.cancel()
self._timeout = task.deferLater(
reactor, self._timeout_seconds, self.monitoring_down # type: ignore
reactor, self.monitoring_down_seconds, self.monitoring_down # type: ignore
).addErrback(default_errback)
# Filter alerts for this site
@@ -228,7 +229,6 @@ class ServiceManager(object):
self.definition.clear()
self.definition.update(definition)
self.label = self.definition["label"]
# TODO: Recover status after server restart
self.component_labels.clear()
self.component_labels.extend(
[
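Finally, to tie the settings back to observable behaviour, here is a
standalone sketch of the incident grouping documented earlier (alerts
at 09:00, 09:20 and 09:40 share an incident, 11:00 starts a new one).
It mirrors the documented rule rather than reusing the project's
classes:

    from datetime import datetime, timedelta

    def group_into_incidents(timestamps, gap=timedelta(minutes=60)):
        """Start a new incident whenever the gap since the last alert exceeds `gap`."""
        incidents = []
        for ts in sorted(timestamps):
            if incidents and ts - incidents[-1][-1] <= gap:
                incidents[-1].append(ts)  # recent enough: same incident
            else:
                incidents.append([ts])  # too long since the last alert: new incident
        return incidents

    alerts = [datetime(2023, 12, 11, h, m) for h, m in [(9, 0), (9, 20), (9, 40), (11, 0)]]
    print(len(group_into_incidents(alerts)))  # 2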