diff options
author | Anton Khirnov <anton@khirnov.net> | 2018-11-29 20:05:57 +0100 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2020-02-03 13:00:02 +0100 |
commit | 11c5995ff803850d41eb93e5bfe7488722b11892 (patch) | |
tree | ffca85c217316c434829e7c4c381e6c85974de4c | |
parent | c96eb5766496a050cea2f350bf5ef24b066888cd (diff) |
Let python decode the message transfer encoding.
-rw-r--r-- | alot/db/utils.py | 57 | ||||
-rw-r--r-- | tests/db/test_utils.py | 24 |
2 files changed, 14 insertions, 67 deletions
diff --git a/alot/db/utils.py b/alot/db/utils.py index 97406e3a..86c4718b 100644 --- a/alot/db/utils.py +++ b/alot/db/utils.py @@ -398,64 +398,25 @@ def remove_cte(part, as_string=False): :returns: The mail with any Content-Transfer-Encoding removed :rtype: Union[str, bytes] """ - enc = part.get_content_charset() or 'ascii' - cte = str(part.get('content-transfer-encoding', '7bit')).lower().strip() - payload = part.get_payload() - sp = '' # string variant of return value - bp = b'' # bytestring variant - - logging.debug('Content-Transfer-Encoding: "{}"'.format(cte)) - if cte not in ['quoted-printable', 'base64', '7bit', '8bit', 'binary']: - logging.info('Unknown Content-Transfer-Encoding: "{}"'.format(cte)) - - # switch through all sensible cases - # starting with those where payload is already a str - if '7bit' in cte or 'binary' in cte: - logging.debug('assuming Content-Transfer-Encoding: 7bit') - sp = payload - if as_string: - return sp - bp = payload.encode('utf-8') - return bp - - # the remaining cases need decoding and define only bt; - # decoding into a str is done at the end if requested - elif '8bit' in cte: - logging.debug('assuming Content-Transfer-Encoding: 8bit') - # Python's mail library may decode 8bit as raw-unicode-escape, so - # we need to encode that back to bytes so we can decode it using - # the correct encoding, or it might not, in which case assume that - # the str representation we got is correct. - bp = payload.encode('raw-unicode-escape') - - elif 'quoted-printable' in cte: - logging.debug('assuming Content-Transfer-Encoding: quoted-printable') - bp = quopri.decodestring(payload.encode('ascii')) - - elif 'base64' in cte: - logging.debug('assuming Content-Transfer-Encoding: base64') - bp = base64.b64decode(payload) - - else: - logging.debug('failed to interpret Content-Transfer-Encoding: ' - '"{}"'.format(cte)) - - # by now, bp is defined, sp is not. + payload = part.get_payload(decode = True) if as_string: + enc = part.get_content_charset('ascii') + if enc.startswith('windows-'): + enc = enc.replace('windows-', 'cp', 1) + try: - sp = bp.decode(enc) + payload = payload.decode(enc, errors = 'backslashreplace') except LookupError: # enc is unknown; # fall back to guessing the correct encoding using libmagic - sp = helper.try_decode(bp) + payload = helper.try_decode(payload) except UnicodeDecodeError as emsg: # the mail contains chars that are not enc-encoded. # libmagic works better than just ignoring those logging.debug('Decoding failure: {}'.format(emsg)) - sp = helper.try_decode(bp) - return sp - return bp + payload = helper.try_decode(payload) + return payload MISSING_HTML_MSG = ("This message contains a text/html part that was not " "rendered due to a missing mailcap entry. " diff --git a/tests/db/test_utils.py b/tests/db/test_utils.py index 98a8247c..5622ff4b 100644 --- a/tests/db/test_utils.py +++ b/tests/db/test_utils.py @@ -732,35 +732,21 @@ class TestRemoveCte(unittest.TestCase): with open('tests/static/mail/broken-utf8.eml') as fp: mail = email.message_from_file(fp) # This should not raise an UnicodeDecodeError. - with self.assertLogs(level='DEBUG') as cm: # keep logs - utils.remove_cte(mail, as_string=True) - # We expect no Exceptions but a complaint in the log - logmsg = 'DEBUG:root:Decoding failure: \'utf-8\' codec can\'t decode '\ - 'byte 0xa1 in position 14: invalid start byte' - self.assertIn(logmsg, cm.output) + payload = utils.remove_cte(mail, as_string=True) + expected = '¡This works!\n\\xa1This doesn\'t!\n' + self.assertEqual(payload, expected) def test_malformed_cte_value(self): with open('tests/static/mail/malformed-header-CTE.eml') as fp: mail = email.message_from_file(fp) - with self.assertLogs(level='INFO') as cm: # keep logs - utils.remove_cte(mail, as_string=True) - - # We expect no Exceptions but a complaint in the log - logmsg = 'INFO:root:Unknown Content-Transfer-Encoding: "7bit;"' - self.assertEqual(cm.output, [logmsg]) + payload = utils.remove_cte(mail, as_string=True) def test_unknown_cte_value(self): with open('tests/static/mail/malformed-header-CTE-2.eml') as fp: mail = email.message_from_file(fp) - with self.assertLogs(level='DEBUG') as cm: # keep logs - utils.remove_cte(mail, as_string=True) - - # We expect no Exceptions but a complaint in the log - logmsg = 'DEBUG:root:failed to interpret Content-Transfer-Encoding: '\ - '"normal"' - self.assertIn(logmsg, cm.output) + payload = utils.remove_cte(mail, as_string=True) class Test_ensure_unique_address(unittest.TestCase): |