diff options
-rw-r--r-- | alot/mail/envelope.py | 71 |
1 files changed, 51 insertions, 20 deletions
diff --git a/alot/mail/envelope.py b/alot/mail/envelope.py index bd214568..aa2a66e2 100644 --- a/alot/mail/envelope.py +++ b/alot/mail/envelope.py @@ -408,24 +408,57 @@ class Envelope: if reset: self.headers.clear() - headerEndPos = 0 - if not only_body: - # go through multiline, utf-8 encoded headers - # locally, lines are separated by a simple LF, not CRLF - # we decode the edited text ourselves here as - # email.message_from_file can't deal with raw utf8 header values - headerRe = re.compile(r'^(?P<k>.+?):(?P<v>(.|\n[ \t\r\f\v])+)$', - re.MULTILINE) - for header in headerRe.finditer(raw): - if header.start() > headerEndPos + 1: - break # switched to body - - key = header.group('k') - # simple unfolding as decribed in - # https://tools.ietf.org/html/rfc2822#section-2.2.3 - unfoldedValue = header.group('v').replace('\n', '') - self.add(key, unfoldedValue.strip()) - headerEndPos = header.end() + if only_body: + self.body = raw + else: + # Split the raw string into headers and body. + # The string should be in "pseudo-email" format: a sequence of + # (possibly folded) headers, followed by an empty line, followed by + # the body. Since it may come from the user's text editor, we try to + # be lenient in parsing it. + # + # The differences from the real email format are: + # - line breaks do not have to be CRLF as per the RFC, but can be + # whatever the user's text editor wrote; + # - all text is a plain Unicode string, with no email encoding + # applied. + # - Attach headers are interpreted as instruction to us to attach + # the specified files. + # Since the email package cannot parse this (FIXME: might actually + # be possible with a custom policy - check this), we do it manually + # ourselves. + + # Use bytes.splitlines(), so that only ASCII CR, LF, or CRLF are + # considered. str.splitlines() would also split on various Unicode + # linebreaks which we probably? want to preserve. + lines = list(map(lambda l: l.decode('utf-8'), + raw.encode('utf-8').splitlines())) + + # list of the last seen [header name, header value], for unfolding + prev_header = None + for i, l in enumerate(lines): + if l and l[0] in ' \t' and prev_header: + # continuation of a folded header + prev_header[1] += l + elif re.match('[!-9;-~]+:', l): + # beginning of a new header + # as per RFC5322 2.2, header names are ASCII chars 33-126 + # except colon + if prev_header: + self.add(*prev_header) + + prev_header = l.split(':', maxsplit = 1) + else: + # anything else is assumed to start the body + + # skip the empty line separating headers from the body, + # if present + idx = i if (len(l) > 0 or i == 0) else i + 1 + self.body = '\n'.join(lines[idx:]) + break + + if prev_header: + self.add(*prev_header) # interpret 'Attach' pseudo header if 'Attach' in self: @@ -439,8 +472,6 @@ class Envelope: self.attach_file(path) del self['Attach'] - self.body = raw[headerEndPos:].strip() - _MAILTO_PREFIX = 'mailto:' _MAILTO_SAFE_HEADERS = (HDR.SUBJECT, HDR.CC, HDR.KEYWORDS) |