From f131bf8d3e934bb548615214239cfe131f8e33f5 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson <matthew@matrix.org>
Date: Sun, 1 Jul 2018 05:08:51 +0100
Subject: [PATCH 1/4] don't mix unicode strings with utf8-in-byte-strings

otherwise we explode with:

```
Traceback (most recent call last):
  File /usr/lib/python2.7/logging/handlers.py, line 78, in emit
    logging.FileHandler.emit(self, record)
  File /usr/lib/python2.7/logging/__init__.py, line 950, in emit
    StreamHandler.emit(self, record)
  File /usr/lib/python2.7/logging/__init__.py, line 887, in emit
    self.handleError(record)
  File /usr/lib/python2.7/logging/__init__.py, line 810, in handleError
    None, sys.stderr)
  File /usr/lib/python2.7/traceback.py, line 124, in print_exception
    _print(file, 'Traceback (most recent call last):')
  File /usr/lib/python2.7/traceback.py, line 13, in _print
    file.write(str+terminator)
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_io.py, line 170, in write
    self.log.emit(self.level, format=u{log_io}, log_io=line)
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_logger.py, line 144, in emit
    self.observer(event)
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_observer.py, line 136, in __call__
    errorLogger = self._errorLoggerForObserver(brokenObserver)
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_observer.py, line 156, in _errorLoggerForObserver
    if obs is not observer
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_observer.py, line 81, in __init__
    self.log = Logger(observer=self)
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_logger.py, line 64, in __init__
    namespace = self._namespaceFromCallingContext()
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/logger/_logger.py, line 42, in _namespaceFromCallingContext
    return currentframe(2).f_globals[__name__]
  File /home/matrix/.synapse/local/lib/python2.7/site-packages/twisted/python/compat.py, line 93, in currentframe
    for x in range(n + 1):
RuntimeError: maximum recursion depth exceeded while calling a Python object
Logged from file site.py, line 129
  File /usr/lib/python2.7/logging/__init__.py, line 859, in emit
    msg = self.format(record)
  File /usr/lib/python2.7/logging/__init__.py, line 732, in format
    return fmt.format(record)
  File /usr/lib/python2.7/logging/__init__.py, line 471, in format
    record.message = record.getMessage()
  File /usr/lib/python2.7/logging/__init__.py, line 335, in getMessage
    msg = msg % self.args
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
Logged from file site.py, line 129
```

...where the logger apparently recurses whilst trying to log the error, hitting the
maximum recursion depth and killing everything badly.
---
 synapse/http/site.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/synapse/http/site.py b/synapse/http/site.py
index 74a752d6cf..2ab0d8ff78 100644
--- a/synapse/http/site.py
+++ b/synapse/http/site.py
@@ -113,7 +113,9 @@ class SynapseRequest(Request):
             " %sB %s \"%s %s %s\" \"%s\" [%d dbevts]",
             self.getClientIP(),
             self.site.site_tag,
-            self.authenticated_entity,
+            # need to decode as it could be raw utf-8 bytes
+            # from a IDN servname in an auth header
+            self.authenticated_entity.decode("utf-8"),
             end_time - self.start_time,
             ru_utime,
             ru_stime,
@@ -125,7 +127,12 @@ class SynapseRequest(Request):
             self.method,
             self.get_redacted_uri(),
             self.clientproto,
-            self.get_user_agent(),
+            # need to decode as could be raw utf-8 bytes
+            # from a utf-8 user-agent.
+            # N.B. if you don't do this, the logger explodes
+            # with maximum recursion trying to log errors about
+            # the charset problem.
+            self.get_user_agent().decode("utf-8"),
             evt_db_fetch_count,
         )
 

From 1c867f5391bff1b060290f834b177f4c339af65d Mon Sep 17 00:00:00 2001
From: Matthew Hodgson <matthew@matrix.org>
Date: Sun, 1 Jul 2018 11:56:33 +0100
Subject: [PATCH 2/4] a fix which doesn't NPE everywhere

---
 synapse/http/site.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/synapse/http/site.py b/synapse/http/site.py
index 2ab0d8ff78..14ea9c21c8 100644
--- a/synapse/http/site.py
+++ b/synapse/http/site.py
@@ -107,15 +107,28 @@ class SynapseRequest(Request):
 
         end_time = time.time()
 
+        # need to decode as it could be raw utf-8 bytes
+        # from a IDN servname in an auth header
+        authenticated_entity = self.authenticated_entity
+        if authenticated_entity is not None:
+            authenticated_entity = authenticated_entity.decode("utf-8")
+
+        # ...or could be raw utf-8 bytes in the User-Agent header.
+        # N.B. if you don't do this, the logger explodes cryptically
+        # with maximum recursion trying to log errors about
+        # the charset problem.
+        # c.f. https://github.com/matrix-org/synapse/issues/3471
+        user_agent = self.get_user_agent()
+        if user_agent is not None:
+            user_agent = user_agent.decode("utf-8")
+
         self.site.access_logger.info(
             "%s - %s - {%s}"
             " Processed request: %.3fsec (%.3fsec, %.3fsec) (%.3fsec/%.3fsec/%d)"
             " %sB %s \"%s %s %s\" \"%s\" [%d dbevts]",
             self.getClientIP(),
             self.site.site_tag,
-            # need to decode as it could be raw utf-8 bytes
-            # from a IDN servname in an auth header
-            self.authenticated_entity.decode("utf-8"),
+            authenticated_entity,
             end_time - self.start_time,
             ru_utime,
             ru_stime,
@@ -127,12 +140,7 @@ class SynapseRequest(Request):
             self.method,
             self.get_redacted_uri(),
             self.clientproto,
-            # need to decode as could be raw utf-8 bytes
-            # from a utf-8 user-agent.
-            # N.B. if you don't do this, the logger explodes
-            # with maximum recursion trying to log errors about
-            # the charset problem.
-            self.get_user_agent().decode("utf-8"),
+            user_agent,
             evt_db_fetch_count,
         )
 

From fc4f8f33be7bcfa8fca5d1aa1f07a1507e8d56e7 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson <matthew@matrix.org>
Date: Mon, 2 Jul 2018 11:33:02 +0100
Subject: [PATCH 3/4] replace invalid utf8 with \ufffd

---
 synapse/http/site.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/synapse/http/site.py b/synapse/http/site.py
index 14ea9c21c8..fe93643b1e 100644
--- a/synapse/http/site.py
+++ b/synapse/http/site.py
@@ -111,7 +111,7 @@ class SynapseRequest(Request):
         # from a IDN servname in an auth header
         authenticated_entity = self.authenticated_entity
         if authenticated_entity is not None:
-            authenticated_entity = authenticated_entity.decode("utf-8")
+            authenticated_entity = authenticated_entity.decode("utf-8", "replace")
 
         # ...or could be raw utf-8 bytes in the User-Agent header.
         # N.B. if you don't do this, the logger explodes cryptically
@@ -120,7 +120,7 @@ class SynapseRequest(Request):
         # c.f. https://github.com/matrix-org/synapse/issues/3471
         user_agent = self.get_user_agent()
         if user_agent is not None:
-            user_agent = user_agent.decode("utf-8")
+            user_agent = user_agent.decode("utf-8", "replace")
 
         self.site.access_logger.info(
             "%s - %s - {%s}"

From 6ec3aa2f72b2a0cc8d58a2448de7d7af5ed5f05e Mon Sep 17 00:00:00 2001
From: Matthew Hodgson <matthew@matrix.org>
Date: Mon, 2 Jul 2018 13:43:34 +0100
Subject: [PATCH 4/4] news snippet

---
 changelog.d/3470.bugfix | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/3470.bugfix

diff --git a/changelog.d/3470.bugfix b/changelog.d/3470.bugfix
new file mode 100644
index 0000000000..1308931191
--- /dev/null
+++ b/changelog.d/3470.bugfix
@@ -0,0 +1 @@
+Fix bug where synapse would explode when receiving unicode in HTTP User-Agent header