Allow config of the backoff algorithm for the federation client. (#15754)

Adds three new configuration variables: * destination_min_retry_interval is identical to before (10mn). * destination_retry_multiplier is now 2 instead of 5, the maximum value will be reached slower. * destination_max_retry_interval is one day instead of (essentially) infinity. Capping this will cause destinations to continue to be retried sometimes instead of being lost forever. The previous value was 2 ^ 62 milliseconds.
2025-03-28 02:18:11 +00:00 · 2023-08-03 20:36:55 +02:00 · 2023-08-03 20:36:55 +02:00 · f0a860908b
commit f0a860908b
parent 9c462f18a4
6 changed files with 64 additions and 26 deletions
--- a/changelog.d/15754.misc
+++ b/changelog.d/15754.misc
@ -0,0 +1 @@
+Allow for the configuration of the backoff algorithm for federation destinations.
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@ -1242,6 +1242,14 @@ like sending a federation transaction.
 * `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
 * `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.

+The following options control the retry logic when communicating with a specific homeserver destination.
+Unlike the previous configuration options, these values apply across all requests
+for a given destination and the state of the backoff is stored in the database.
+
+* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
+* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2.
+* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.
+
 Example configuration:
 ```yaml
 federation:
@ -1250,6 +1258,9 @@ federation:
  max_long_retry_delay: 100s
  max_short_retries: 5
  max_long_retries: 20
+  destination_min_retry_interval: 30s
+  destination_retry_multiplier: 5
+  destination_max_retry_interval: 12h
 ```
 ---
 ## Caching
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@ -65,5 +65,23 @@ class FederationConfig(Config):
        self.max_long_retries = federation_config.get("max_long_retries", 10)
        self.max_short_retries = federation_config.get("max_short_retries", 3)

+        # Allow for the configuration of the backoff algorithm used
+        # when trying to reach an unavailable destination.
+        # Unlike previous configuration those values applies across
+        # multiple requests and the state of the backoff is stored on DB.
+        self.destination_min_retry_interval_ms = Config.parse_duration(
+            federation_config.get("destination_min_retry_interval", "10m")
+        )
+        self.destination_retry_multiplier = federation_config.get(
+            "destination_retry_multiplier", 2
+        )
+        self.destination_max_retry_interval_ms = min(
+            Config.parse_duration(
+                federation_config.get("destination_max_retry_interval", "7d")
+            ),
+            # Set a hard-limit to not overflow the database column.
+            2**62,
+        )
+

 _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@ -27,15 +27,6 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-# the initial backoff, after the first transaction fails
-MIN_RETRY_INTERVAL = 10 * 60 * 1000
-
-# how much we multiply the backoff by after each subsequent fail
-RETRY_MULTIPLIER = 5
-
-# a cap on the backoff. (Essentially none)
-MAX_RETRY_INTERVAL = 2**62
-

 class NotRetryingDestination(Exception):
    def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@ -169,6 +160,16 @@ class RetryDestinationLimiter:
        self.notifier = notifier
        self.replication_client = replication_client

+        self.destination_min_retry_interval_ms = (
+            self.store.hs.config.federation.destination_min_retry_interval_ms
+        )
+        self.destination_retry_multiplier = (
+            self.store.hs.config.federation.destination_retry_multiplier
+        )
+        self.destination_max_retry_interval_ms = (
+            self.store.hs.config.federation.destination_max_retry_interval_ms
+        )
+
    def __enter__(self) -> None:
        pass

@ -220,13 +221,15 @@ class RetryDestinationLimiter:
            # We couldn't connect.
            if self.retry_interval:
                self.retry_interval = int(
-                    self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
+                    self.retry_interval
+                    * self.destination_retry_multiplier
+                    * random.uniform(0.8, 1.4)
                )

-                if self.retry_interval >= MAX_RETRY_INTERVAL:
-                    self.retry_interval = MAX_RETRY_INTERVAL
+                if self.retry_interval >= self.destination_max_retry_interval_ms:
+                    self.retry_interval = self.destination_max_retry_interval_ms
            else:
-                self.retry_interval = MIN_RETRY_INTERVAL
+                self.retry_interval = self.destination_min_retry_interval_ms

            logger.info(
                "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
--- a/tests/storage/test_transactions.py
+++ b/tests/storage/test_transactions.py
@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor
 from synapse.server import HomeServer
 from synapse.storage.databases.main.transactions import DestinationRetryTimings
 from synapse.util import Clock
-from synapse.util.retryutils import MAX_RETRY_INTERVAL

 from tests.unittest import HomeserverTestCase

@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase):
        self.get_success(d)

    def test_large_destination_retry(self) -> None:
+        max_retry_interval_ms = (
+            self.hs.config.federation.destination_max_retry_interval_ms
+        )
        d = self.store.set_destination_retry_timings(
-            "example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
+            "example.com",
+            max_retry_interval_ms,
+            max_retry_interval_ms,
+            max_retry_interval_ms,
        )
        self.get_success(d)

--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@ -11,12 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from synapse.util.retryutils import (
-    MIN_RETRY_INTERVAL,
-    RETRY_MULTIPLIER,
-    NotRetryingDestination,
-    get_retry_limiter,
-)
+from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

 from tests.unittest import HomeserverTestCase

@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase):

        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

+        min_retry_interval_ms = (
+            self.hs.config.federation.destination_min_retry_interval_ms
+        )
+        retry_multiplier = self.hs.config.federation.destination_retry_multiplier
+
        self.pump(1)
        try:
            with limiter:
@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
        assert new_timings is not None
        self.assertEqual(new_timings.failure_ts, failure_ts)
        self.assertEqual(new_timings.retry_last_ts, failure_ts)
-        self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
+        self.assertEqual(new_timings.retry_interval, min_retry_interval_ms)

        # now if we try again we should get a failure
        self.get_failure(
@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
        # advance the clock and try again
        #

-        self.pump(MIN_RETRY_INTERVAL)
+        self.pump(min_retry_interval_ms)
        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

        self.pump(1)
@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
        self.assertEqual(new_timings.failure_ts, failure_ts)
        self.assertEqual(new_timings.retry_last_ts, retry_ts)
        self.assertGreaterEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5
        )
        self.assertLessEqual(
-            new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
+            new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0
        )

        #
        # one more go, with success
        #
-        self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
+        self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

        self.pump(1)
				`@ -0,0 +1 @@`
				`Allow for the configuration of the backoff algorithm for federation destinations.`