Add 'failure_ts' column to 'destinations' table (#6016)

Track the time that a server started failing at, for general analysis purposes.
2025-03-28 02:18:11 +00:00 · 2019-09-17 11:41:54 +01:00 · 2019-09-17 11:41:54 +01:00 · 1e19ce00bf
commit 1e19ce00bf
parent 850dcfd2d3
7 changed files with 195 additions and 12 deletions
--- a/changelog.d/6016.misc
+++ b/changelog.d/6016.misc
@ -0,0 +1 @@
+Add a 'failure_ts' column to the 'destinations' database table.
--- a/synapse/storage/schema/delta/56/destinations_failure_ts.sql
+++ b/synapse/storage/schema/delta/56/destinations_failure_ts.sql
@ -0,0 +1,25 @@
+/* Copyright 2019 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Record the timestamp when a given server started failing
+ */
+ALTER TABLE destinations ADD failure_ts BIGINT;
+
+/* as a rough approximation, we assume that the server started failing at
+ * retry_interval before the last retry
+ */
+UPDATE destinations SET failure_ts = retry_last_ts - retry_interval
+    WHERE retry_last_ts > 0;
--- a/synapse/storage/transactions.py
+++ b/synapse/storage/transactions.py
@ -165,7 +165,7 @@ class TransactionStore(SQLBaseStore):
            txn,
            table="destinations",
            keyvalues={"destination": destination},
-            retcols=("destination", "retry_last_ts", "retry_interval"),
+            retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"),
            allow_none=True,
        )

@ -174,12 +174,15 @@ class TransactionStore(SQLBaseStore):
        else:
            return None

-    def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval):
+    def set_destination_retry_timings(
+        self, destination, failure_ts, retry_last_ts, retry_interval
+    ):
        """Sets the current retry timings for a given destination.
        Both timings should be zero if retrying is no longer occuring.

        Args:
            destination (str)
+            failure_ts (int|None) - when the server started failing (ms since epoch)
            retry_last_ts (int) - time of last retry attempt in unix epoch ms
            retry_interval (int) - how long until next retry in ms
        """
@ -189,12 +192,13 @@ class TransactionStore(SQLBaseStore):
            "set_destination_retry_timings",
            self._set_destination_retry_timings,
            destination,
+            failure_ts,
            retry_last_ts,
            retry_interval,
        )

    def _set_destination_retry_timings(
-        self, txn, destination, retry_last_ts, retry_interval
+        self, txn, destination, failure_ts, retry_last_ts, retry_interval
    ):

        if self.database_engine.can_native_upsert:
@ -202,9 +206,12 @@ class TransactionStore(SQLBaseStore):
            # resetting it) or greater than the existing retry interval.

            sql = """
-                INSERT INTO destinations (destination, retry_last_ts, retry_interval)
-                    VALUES (?, ?, ?)
+                INSERT INTO destinations (
+                    destination, failure_ts, retry_last_ts, retry_interval
+                )
+                    VALUES (?, ?, ?, ?)
                ON CONFLICT (destination) DO UPDATE SET
+                        failure_ts = EXCLUDED.failure_ts,
                        retry_last_ts = EXCLUDED.retry_last_ts,
                        retry_interval = EXCLUDED.retry_interval
                    WHERE
@ -212,7 +219,7 @@ class TransactionStore(SQLBaseStore):
                        OR destinations.retry_interval < EXCLUDED.retry_interval
            """

-            txn.execute(sql, (destination, retry_last_ts, retry_interval))
+            txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))

            return

@ -225,7 +232,7 @@ class TransactionStore(SQLBaseStore):
            txn,
            table="destinations",
            keyvalues={"destination": destination},
-            retcols=("retry_last_ts", "retry_interval"),
+            retcols=("failure_ts", "retry_last_ts", "retry_interval"),
            allow_none=True,
        )

@ -235,6 +242,7 @@ class TransactionStore(SQLBaseStore):
                table="destinations",
                values={
                    "destination": destination,
+                    "failure_ts": failure_ts,
                    "retry_last_ts": retry_last_ts,
                    "retry_interval": retry_interval,
                },
@ -245,6 +253,7 @@ class TransactionStore(SQLBaseStore):
                "destinations",
                keyvalues={"destination": destination},
                updatevalues={
+                    "failure_ts": failure_ts,
                    "retry_last_ts": retry_last_ts,
                    "retry_interval": retry_interval,
                },
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@ -80,11 +80,13 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
            # We aren't ready to retry that destination.
            raise
    """
+    failure_ts = None
    retry_last_ts, retry_interval = (0, 0)

    retry_timings = yield store.get_destination_retry_timings(destination)

    if retry_timings:
+        failure_ts = retry_timings["failure_ts"]
        retry_last_ts, retry_interval = (
            retry_timings["retry_last_ts"],
            retry_timings["retry_interval"],
@ -108,6 +110,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
        destination,
        clock,
        store,
+        failure_ts,
        retry_interval,
        backoff_on_failure=backoff_on_failure,
        **kwargs
@ -120,6 +123,7 @@ class RetryDestinationLimiter(object):
        destination,
        clock,
        store,
+        failure_ts,
        retry_interval,
        backoff_on_404=False,
        backoff_on_failure=True,
@ -133,6 +137,8 @@ class RetryDestinationLimiter(object):
            destination (str)
            clock (Clock)
            store (DataStore)
+            failure_ts (int|None): when this destination started failing (in ms since
+                the epoch), or zero if the last request was successful
            retry_interval (int): The next retry interval taken from the
                database in milliseconds, or zero if the last request was
                successful.
@ -145,6 +151,7 @@ class RetryDestinationLimiter(object):
        self.store = store
        self.destination = destination

+        self.failure_ts = failure_ts
        self.retry_interval = retry_interval
        self.backoff_on_404 = backoff_on_404
        self.backoff_on_failure = backoff_on_failure
@ -186,6 +193,7 @@ class RetryDestinationLimiter(object):
            logger.debug(
                "Connection to %s was successful; clearing backoff", self.destination
            )
+            self.failure_ts = None
            retry_last_ts = 0
            self.retry_interval = 0
        elif not self.backoff_on_failure:
@ -211,11 +219,17 @@ class RetryDestinationLimiter(object):
            )
            retry_last_ts = int(self.clock.time_msec())

+            if self.failure_ts is None:
+                self.failure_ts = retry_last_ts
+
        @defer.inlineCallbacks
        def store_retry_timings():
            try:
                yield self.store.set_destination_retry_timings(
-                    self.destination, retry_last_ts, self.retry_interval
+                    self.destination,
+                    self.failure_ts,
+                    retry_last_ts,
+                    self.retry_interval,
                )
            except Exception:
                logger.exception("Failed to store destination_retry_timings")
--- a/tests/handlers/test_typing.py
+++ b/tests/handlers/test_typing.py
@ -99,7 +99,12 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
        self.event_source = hs.get_event_sources().sources["typing"]

        self.datastore = hs.get_datastore()
-        retry_timings_res = {"destination": "", "retry_last_ts": 0, "retry_interval": 0}
+        retry_timings_res = {
+            "destination": "",
+            "retry_last_ts": 0,
+            "retry_interval": 0,
+            "failure_ts": None,
+        }
        self.datastore.get_destination_retry_timings.return_value = defer.succeed(
            retry_timings_res
        )
--- a/tests/storage/test_transactions.py
+++ b/tests/storage/test_transactions.py
@ -29,17 +29,19 @@ class TransactionStoreTestCase(HomeserverTestCase):
        r = self.get_success(d)
        self.assertIsNone(r)

-        d = self.store.set_destination_retry_timings("example.com", 50, 100)
+        d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
        self.get_success(d)

        d = self.store.get_destination_retry_timings("example.com")
        r = self.get_success(d)

-        self.assert_dict({"retry_last_ts": 50, "retry_interval": 100}, r)
+        self.assert_dict(
+            {"retry_last_ts": 50, "retry_interval": 100, "failure_ts": 1000}, r
+        )

    def test_initial_set_transactions(self):
        """Tests that we can successfully set the destination retries (there
        was a bug around invalidating the cache that broke this)
        """
-        d = self.store.set_destination_retry_timings("example.com", 50, 100)
+        d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
        self.get_success(d)
--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# Copyright 2019 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from synapse.util.retryutils import (
+    MIN_RETRY_INTERVAL,
+    RETRY_MULTIPLIER,
+    NotRetryingDestination,
+    get_retry_limiter,
+)
+
+from tests.unittest import HomeserverTestCase
+
+
+class RetryLimiterTestCase(HomeserverTestCase):
+    def test_new_destination(self):
+        """A happy-path case with a new destination and a successful operation"""
+        store = self.hs.get_datastore()
+        d = get_retry_limiter("test_dest", self.clock, store)
+        self.pump()
+        limiter = self.successResultOf(d)
+
+        # advance the clock a bit before making the request
+        self.pump(1)
+
+        with limiter:
+            pass
+
+        d = store.get_destination_retry_timings("test_dest")
+        self.pump()
+        new_timings = self.successResultOf(d)
+        self.assertIsNone(new_timings)
+
+    def test_limiter(self):
+        """General test case which walks through the process of a failing request"""
+        store = self.hs.get_datastore()
+
+        d = get_retry_limiter("test_dest", self.clock, store)
+        self.pump()
+        limiter = self.successResultOf(d)
+
+        self.pump(1)
+        try:
+            with limiter:
+                self.pump(1)
+                failure_ts = self.clock.time_msec()
+                raise AssertionError("argh")
+        except AssertionError:
+            pass
+
+        # wait for the update to land
+        self.pump()
+
+        d = store.get_destination_retry_timings("test_dest")
+        self.pump()
+        new_timings = self.successResultOf(d)
+        self.assertEqual(new_timings["failure_ts"], failure_ts)
+        self.assertEqual(new_timings["retry_last_ts"], failure_ts)
+        self.assertEqual(new_timings["retry_interval"], MIN_RETRY_INTERVAL)
+
+        # now if we try again we should get a failure
+        d = get_retry_limiter("test_dest", self.clock, store)
+        self.pump()
+        self.failureResultOf(d, NotRetryingDestination)
+
+        #
+        # advance the clock and try again
+        #
+
+        self.pump(MIN_RETRY_INTERVAL)
+        d = get_retry_limiter("test_dest", self.clock, store)
+        self.pump()
+        limiter = self.successResultOf(d)
+
+        self.pump(1)
+        try:
+            with limiter:
+                self.pump(1)
+                retry_ts = self.clock.time_msec()
+                raise AssertionError("argh")
+        except AssertionError:
+            pass
+
+        # wait for the update to land
+        self.pump()
+
+        d = store.get_destination_retry_timings("test_dest")
+        self.pump()
+        new_timings = self.successResultOf(d)
+        self.assertEqual(new_timings["failure_ts"], failure_ts)
+        self.assertEqual(new_timings["retry_last_ts"], retry_ts)
+        self.assertGreaterEqual(
+            new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
+        )
+        self.assertLessEqual(
+            new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
+        )
+
+        #
+        # one more go, with success
+        #
+        self.pump(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
+        d = get_retry_limiter("test_dest", self.clock, store)
+        self.pump()
+        limiter = self.successResultOf(d)
+
+        self.pump(1)
+        with limiter:
+            self.pump(1)
+
+        # wait for the update to land
+        self.pump()
+
+        d = store.get_destination_retry_timings("test_dest")
+        self.pump()
+        new_timings = self.successResultOf(d)
+        self.assertIsNone(new_timings)
				`@ -0,0 +1 @@`
				`Add a 'failure_ts' column to the 'destinations' database table.`