summaryrefslogtreecommitdiff
path: root/alot
diff options
context:
space:
mode:
authorPatrick Totzke <patricktotzke@gmail.com>2018-12-04 21:52:01 +0000
committerPatrick Totzke <patricktotzke@gmail.com>2018-12-07 20:13:38 +0000
commit176cffcd8f8c7a7a6b7367781f1e7a7c5ebc9976 (patch)
tree34d86777029498a175c5e7789981622e693a5128 /alot
parentb6606a5402aa2c50d202e7feb27e74d2c7202681 (diff)
refactor alot.db.utils.remove_cte
This makes remove_cte more accepting of incorrect Content-Transfer-Encoding header values: instead of looking for exact matches of valid values, it now tests whether a valid value appears as a substring.
Diffstat (limited to 'alot')
-rw-r--r--alot/db/utils.py93
1 files changed, 60 insertions, 33 deletions
diff --git a/alot/db/utils.py b/alot/db/utils.py
index 1284098c..ea0963c1 100644
--- a/alot/db/utils.py
+++ b/alot/db/utils.py
@@ -375,10 +375,19 @@ def render_part(part, field_key='copiousoutput'):
def remove_cte(part, as_string=False):
- """Decodes any Content-Transfer-Encodings.
+ """Interpret MIME-part according to its Content-Transfer-Encoding.
- Can return a string for display, or bytes to be passed to an external
- program.
+ This returns the payload of `part` as string or bytestring for display, or
+ to be passed to an external program. In the raw file the payload may be
+ encoded, e.g. in base64, quoted-printable, 7bit, or 8bit. This method will
+ look for one of the above Content-Transfer-Encoding header values and
+ interpret the payload accordingly.
+
+ Incorrect header values (common in spam messages) will be interpreted as
+ leniently as possible and will result in INFO-level log messages.
+
+ .. note:: All this may be deprecated in favour of
+ `email.contentmanager.raw_data_manager` (v3.6+)
:param email.Message part: The part to decode
:param bool as_string: If true return a str, otherwise return bytes
@@ -388,42 +397,60 @@ def remove_cte(part, as_string=False):
enc = part.get_content_charset() or 'ascii'
cte = str(part.get('content-transfer-encoding', '7bit')).lower().strip()
payload = part.get_payload()
- if cte == '8bit':
+ sp = '' # string variant of return value
+ bp = b'' # bytestring variant
+
+ logging.debug('Content-Transfer-Encoding: "{}"'.format(cte))
+ if cte not in ['quoted-printable', 'base64', '7bit', '8bit', 'binary']:
+ logging.info('Unknown Content-Transfer-Encoding: "{}"'.format(cte))
+
+ # switch through all sensible cases
+ # starting with those where payload is already a str
+ if '7bit' in cte or 'binary' in cte:
+ logging.debug('assuming Content-Transfer-Encoding: 7bit')
+ sp = payload
+ if as_string:
+ return sp
+ bp = payload.encode('utf-8')
+ return bp
+
+ # the remaining cases need decoding and define only bp;
+ # decoding into a str is done at the end if requested
+ elif '8bit' in cte:
+ logging.debug('assuming Content-Transfer-Encoding: 8bit')
# Python's mail library may decode 8bit as raw-unicode-escape, so
# we need to encode that back to bytes so we can decode it using
# the correct encoding, or it might not, in which case assume that
# the str representation we got is correct.
- raw_payload = payload.encode('raw-unicode-escape')
- if not as_string:
- return raw_payload
+ bp = payload.encode('raw-unicode-escape')
+
+ elif 'quoted-printable' in cte:
+ logging.debug('assuming Content-Transfer-Encoding: quoted-printable')
+ bp = quopri.decodestring(payload.encode('ascii'))
+
+ elif 'base64' in cte:
+ logging.debug('assuming Content-Transfer-Encoding: base64')
+ bp = base64.b64decode(payload)
+
+ else:
+ logging.debug('failed to interpret Content-Transfer-Encoding: '
+ '"{}"'.format(cte))
+
+ # by now, bp is defined, sp is not.
+ if as_string:
try:
- return raw_payload.decode(enc)
+ sp = bp.decode(enc)
except LookupError:
- # In this case the email has an unknown encoding, fall back to
- # guessing
- return helper.try_decode(raw_payload)
- except UnicodeDecodeError:
- if not as_string:
- return raw_payload
- return helper.try_decode(raw_payload)
- elif cte in ['7bit', 'binary']:
- if as_string:
- return payload
- return payload.encode('utf-8')
- else:
- if cte == 'quoted-printable':
- raw_payload = quopri.decodestring(payload.encode('ascii'))
- elif cte == 'base64':
- raw_payload = base64.b64decode(payload)
- else:
- raise ValueError(
- 'Unknown Content-Transfer-Encoding: "{}"'.format(cte))
- # message.get_payload(decode=True) also handles a number of unicode
- # encodindigs. maybe those are useful?
- if not as_string:
- return raw_payload
- return raw_payload.decode(enc)
- raise Exception('Unreachable')
+ # enc is unknown;
+ # fall back to guessing the correct encoding using libmagic
+ sp = helper.try_decode(bp)
+ except UnicodeDecodeError as emsg:
+ # the mail contains chars that are not enc-encoded.
+ # try again and just ignore those
+ logging.debug('Decoding failure: {}'.format(emsg))
+ sp = bp.decode(enc, errors='ignore')
+ return sp
+ return bp
def extract_body(mail, types=None, field_key='copiousoutput'):