summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2018-11-29 20:05:57 +0100
committer: Anton Khirnov <anton@khirnov.net> 2020-02-03 13:00:02 +0100
commit: 11c5995ff803850d41eb93e5bfe7488722b11892 (patch)
tree: ffca85c217316c434829e7c4c381e6c85974de4c
parent: c96eb5766496a050cea2f350bf5ef24b066888cd (diff)
Let python decode the message transfer encoding.
-rw-r--r-- alot/db/utils.py 57
-rw-r--r-- tests/db/test_utils.py 24
2 files changed, 14 insertions, 67 deletions
diff --git a/alot/db/utils.py b/alot/db/utils.py
index 97406e3a..86c4718b 100644
--- a/alot/db/utils.py
+++ b/alot/db/utils.py
@@ -398,64 +398,25 @@ def remove_cte(part, as_string=False):
:returns: The mail with any Content-Transfer-Encoding removed
:rtype: Union[str, bytes]
"""
- enc = part.get_content_charset() or 'ascii'
- cte = str(part.get('content-transfer-encoding', '7bit')).lower().strip()
- payload = part.get_payload()
- sp = '' # string variant of return value
- bp = b'' # bytestring variant
-
- logging.debug('Content-Transfer-Encoding: "{}"'.format(cte))
- if cte not in ['quoted-printable', 'base64', '7bit', '8bit', 'binary']:
- logging.info('Unknown Content-Transfer-Encoding: "{}"'.format(cte))
-
- # switch through all sensible cases
- # starting with those where payload is already a str
- if '7bit' in cte or 'binary' in cte:
- logging.debug('assuming Content-Transfer-Encoding: 7bit')
- sp = payload
- if as_string:
- return sp
- bp = payload.encode('utf-8')
- return bp
-
- # the remaining cases need decoding and define only bt;
- # decoding into a str is done at the end if requested
- elif '8bit' in cte:
- logging.debug('assuming Content-Transfer-Encoding: 8bit')
- # Python's mail library may decode 8bit as raw-unicode-escape, so
- # we need to encode that back to bytes so we can decode it using
- # the correct encoding, or it might not, in which case assume that
- # the str representation we got is correct.
- bp = payload.encode('raw-unicode-escape')
-
- elif 'quoted-printable' in cte:
- logging.debug('assuming Content-Transfer-Encoding: quoted-printable')
- bp = quopri.decodestring(payload.encode('ascii'))
-
- elif 'base64' in cte:
- logging.debug('assuming Content-Transfer-Encoding: base64')
- bp = base64.b64decode(payload)
-
- else:
- logging.debug('failed to interpret Content-Transfer-Encoding: '
- '"{}"'.format(cte))
-
- # by now, bp is defined, sp is not.
+ payload = part.get_payload(decode = True)
if as_string:
+ enc = part.get_content_charset('ascii')
+ if enc.startswith('windows-'):
+ enc = enc.replace('windows-', 'cp', 1)
+
try:
- sp = bp.decode(enc)
+ payload = payload.decode(enc, errors = 'backslashreplace')
except LookupError:
# enc is unknown;
# fall back to guessing the correct encoding using libmagic
- sp = helper.try_decode(bp)
+ payload = helper.try_decode(payload)
except UnicodeDecodeError as emsg:
# the mail contains chars that are not enc-encoded.
# libmagic works better than just ignoring those
logging.debug('Decoding failure: {}'.format(emsg))
- sp = helper.try_decode(bp)
- return sp
- return bp
+ payload = helper.try_decode(payload)
+ return payload
MISSING_HTML_MSG = ("This message contains a text/html part that was not "
"rendered due to a missing mailcap entry. "
diff --git a/tests/db/test_utils.py b/tests/db/test_utils.py
index 98a8247c..5622ff4b 100644
--- a/tests/db/test_utils.py
+++ b/tests/db/test_utils.py
@@ -732,35 +732,21 @@ class TestRemoveCte(unittest.TestCase):
with open('tests/static/mail/broken-utf8.eml') as fp:
mail = email.message_from_file(fp)
# This should not raise an UnicodeDecodeError.
- with self.assertLogs(level='DEBUG') as cm: # keep logs
- utils.remove_cte(mail, as_string=True)
- # We expect no Exceptions but a complaint in the log
- logmsg = 'DEBUG:root:Decoding failure: \'utf-8\' codec can\'t decode '\
- 'byte 0xa1 in position 14: invalid start byte'
- self.assertIn(logmsg, cm.output)
+ payload = utils.remove_cte(mail, as_string=True)
+ expected = '¡This works!\n\\xa1This doesn\'t!\n'
+ self.assertEqual(payload, expected)
def test_malformed_cte_value(self):
with open('tests/static/mail/malformed-header-CTE.eml') as fp:
mail = email.message_from_file(fp)
- with self.assertLogs(level='INFO') as cm: # keep logs
- utils.remove_cte(mail, as_string=True)
-
- # We expect no Exceptions but a complaint in the log
- logmsg = 'INFO:root:Unknown Content-Transfer-Encoding: "7bit;"'
- self.assertEqual(cm.output, [logmsg])
+ payload = utils.remove_cte(mail, as_string=True)
def test_unknown_cte_value(self):
with open('tests/static/mail/malformed-header-CTE-2.eml') as fp:
mail = email.message_from_file(fp)
- with self.assertLogs(level='DEBUG') as cm: # keep logs
- utils.remove_cte(mail, as_string=True)
-
- # We expect no Exceptions but a complaint in the log
- logmsg = 'DEBUG:root:failed to interpret Content-Transfer-Encoding: '\
- '"normal"'
- self.assertIn(logmsg, cm.output)
+ payload = utils.remove_cte(mail, as_string=True)
class Test_ensure_unique_address(unittest.TestCase):