summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net>, 2023-08-23 12:34:53 +0200
committer: Anton Khirnov <anton@khirnov.net>, 2023-08-23 12:44:57 +0200
commit: 9e1c4456dee1a1799d54d43b6c09c55d8ddb7cb8 (patch)
tree: 2f1be0d70e77fd78cbe2e4fae1cd4b4534467b0e
parent: 7515515cb889c6bb8435c7eae923891d6b28dd8e (diff)
mail/envelope: improve the parse_template() algorithm [HEAD, master]
It currently assumes that the string to be parsed contains LF-separated lines. This assumption is in general wrong, because the string may be written by the user's text editor and is thus outside of our control and may contain anything. In addition, since emails should use CRLF line endings, it is arguably more correct for text editors to write CRLF to .eml files. E.g. recent versions of vim do exactly that by default. Change the parsing code to accept any of CR/LF/CRLF line endings.
-rw-r--r-- alot/mail/envelope.py 71
1 files changed, 51 insertions, 20 deletions
diff --git a/alot/mail/envelope.py b/alot/mail/envelope.py
index bd214568..aa2a66e2 100644
--- a/alot/mail/envelope.py
+++ b/alot/mail/envelope.py
@@ -408,24 +408,57 @@ class Envelope:
if reset:
self.headers.clear()
- headerEndPos = 0
- if not only_body:
- # go through multiline, utf-8 encoded headers
- # locally, lines are separated by a simple LF, not CRLF
- # we decode the edited text ourselves here as
- # email.message_from_file can't deal with raw utf8 header values
- headerRe = re.compile(r'^(?P<k>.+?):(?P<v>(.|\n[ \t\r\f\v])+)$',
- re.MULTILINE)
- for header in headerRe.finditer(raw):
- if header.start() > headerEndPos + 1:
- break # switched to body
-
- key = header.group('k')
- # simple unfolding as decribed in
- # https://tools.ietf.org/html/rfc2822#section-2.2.3
- unfoldedValue = header.group('v').replace('\n', '')
- self.add(key, unfoldedValue.strip())
- headerEndPos = header.end()
+ if only_body:
+ self.body = raw
+ else:
+ # Split the raw string into headers and body.
+ # The string should be in "pseudo-email" format: a sequence of
+ # (possibly folded) headers, followed by an empty line, followed by
+ # the body. Since it may come from the user's text editor, we try to
+ # be lenient in parsing it.
+ #
+ # The differences from the real email format are:
+ # - line breaks do not have to be CRLF as per the RFC, but can be
+ # whatever the user's text editor wrote;
+ # - all text is a plain Unicode string, with no email encoding
+ # applied.
+ # - Attach headers are interpreted as instruction to us to attach
+ # the specified files.
+ # Since the email package cannot parse this (FIXME: might actually
+ # be possible with a custom policy - check this), we do it manually
+ # ourselves.
+
+ # Use bytes.splitlines(), so that only ASCII CR, LF, or CRLF are
+ # considered. str.splitlines() would also split on various Unicode
+ # linebreaks which we probably? want to preserve.
+ lines = list(map(lambda l: l.decode('utf-8'),
+ raw.encode('utf-8').splitlines()))
+
+ # list of the last seen [header name, header value], for unfolding
+ prev_header = None
+ for i, l in enumerate(lines):
+ if l and l[0] in ' \t' and prev_header:
+ # continuation of a folded header
+ prev_header[1] += l
+ elif re.match('[!-9;-~]+:', l):
+ # beginning of a new header
+ # as per RFC5322 2.2, header names are ASCII chars 33-126
+ # except colon
+ if prev_header:
+ self.add(*prev_header)
+
+ prev_header = l.split(':', maxsplit = 1)
+ else:
+ # anything else is assumed to start the body
+
+ # skip the empty line separating headers from the body,
+ # if present
+ idx = i if (len(l) > 0 or i == 0) else i + 1
+ self.body = '\n'.join(lines[idx:])
+ break
+
+ if prev_header:
+ self.add(*prev_header)
# interpret 'Attach' pseudo header
if 'Attach' in self:
@@ -439,8 +472,6 @@ class Envelope:
self.attach_file(path)
del self['Attach']
- self.body = raw[headerEndPos:].strip()
-
_MAILTO_PREFIX = 'mailto:'
_MAILTO_SAFE_HEADERS = (HDR.SUBJECT, HDR.CC, HDR.KEYWORDS)