aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Worth <cworth@cworth.org>2009-10-19 12:54:40 -0700
committerCarl Worth <cworth@cworth.org>2009-10-19 13:00:43 -0700
commit0e777a8f800af062aba39a95a003f3e1d8f33793 (patch)
tree9d1c73931983ad78f60d866185ea07907782b29f
parent9bc4253fa804b62ff31e8de82a139b2cb12b118f (diff)
notmuch: Switch from gmime to custom, ad-hoc parsing of headers.
Since we're currently just trying to stitch together In-Reply-To and References headers we don't need that much sophistication. It's when we later add full-text searching that GMime will be useful. So for now, even though my own code here is surely very buggy compared to GMime it's also a lot faster. And speed is what we're after for the initial index creation.
-rw-r--r--Makefile6
-rw-r--r--database.cc183
-rw-r--r--message.c300
-rw-r--r--message.h19
-rw-r--r--notmuch-private.h81
5 files changed, 534 insertions, 55 deletions
diff --git a/Makefile b/Makefile
index 4af5a2e..b7ebfb8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,9 @@
PROGS=notmuch notmuch-index-message xapian-dump
-MYCFLAGS=-Wall -O0 -g `pkg-config --cflags gmime-2.4`
+MYCFLAGS=-Wall -O0 -g `pkg-config --cflags glib-2.0`
MYCXXFLAGS=$(MYCFLAGS) `xapian-config --cxxflags`
-MYLDFLAGS=`pkg-config --libs gmime-2.4` `xapian-config --libs`
+MYLDFLAGS=`pkg-config --libs glib-2.0` `xapian-config --libs`
all: $(PROGS)
@@ -13,7 +13,7 @@ all: $(PROGS)
%.o: %.c
$(CC) -c $(CFLAGS) $(MYCFLAGS) $^ -o $@
-notmuch: notmuch.o database.o xutil.o
+notmuch: notmuch.o database.o message.o xutil.o
$(CC) $(MYLDFLAGS) $^ -o $@
notmuch-index-message: notmuch-index-message.cc
diff --git a/database.cc b/database.cc
index 36b1b58..7ea1f41 100644
--- a/database.cc
+++ b/database.cc
@@ -20,20 +20,12 @@
#include "notmuch-private.h"
-#include <stdio.h>
-#include <errno.h>
-#include <time.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-
#include <iostream>
-#include <gmime/gmime.h>
-
#include <xapian.h>
+#include <glib.h>
+
using namespace std;
struct _notmuch_database {
@@ -278,26 +270,113 @@ find_thread_ids (Xapian::Database *db,
return result;
}
-/* Add a term for each message-id in the References header of the
- * message. */
+/* Advance '*str' past any run of whitespace and RFC 822 comments.
+ *
+ * A comment is a parenthesized sequence which may nest. Within a
+ * comment a backslash escapes the following character, (so an escaped
+ * parenthesis does not change the nesting depth).
+ *
+ * If the text to be skipped runs to the end of the string, (which
+ * includes the case of an unterminated comment), then '*str' is left
+ * pointing at the terminating '\0' character.
+ */
+static void
+skip_space_and_comments (const char **str)
+{
+    const char *s = *str;
+
+    /* The casts to unsigned char are required: handing isspace a
+     * negative value, (any byte >= 0x80 where char is signed), is
+     * undefined behavior. */
+    while (*s && (isspace ((unsigned char) *s) || *s == '(')) {
+        while (*s && isspace ((unsigned char) *s))
+            s++;
+
+        if (*s == '(') {
+            int nesting = 1;
+
+            s++;
+            while (*s && nesting) {
+                if (*s == '(') {
+                    nesting++;
+                } else if (*s == ')') {
+                    nesting--;
+                } else if (*s == '\\' && *(s + 1)) {
+                    /* Step over the backslash so that the escaped
+                     * character is consumed by the increment below. */
+                    s++;
+                }
+                s++;
+            }
+        }
+    }
+
+    *str = s;
+}
+
+/* Parse an RFC 822 message-id, discarding leading whitespace, any
+ * RFC 822 comments, any unstructured text before the '<', and the
+ * '<' and '>' delimiters themselves.
+ *
+ * If 'next' is not NULL and 'message_id' is not NULL, then *next is
+ * made to point to the first character not parsed, (possibly the
+ * final '\0' terminator).
+ *
+ * A message-id that is missing its closing '>' is accepted and runs
+ * to the end of the string.
+ *
+ * Returns a newly allocated string which the caller should free()
+ * when done with it.
+ *
+ * Returns NULL if 'message_id' is NULL, if no '<' is found, or if
+ * nothing lies between the delimiters. */
+static char *
+parse_message_id (const char *message_id, const char **next)
+{
+    const char *s, *end;
+
+    if (message_id == NULL)
+        return NULL;
+
+    s = message_id;
+
+    skip_space_and_comments (&s);
+
+    /* Skip any unstructured text as well. */
+    while (*s && *s != '<')
+        s++;
+
+    if (*s != '<') {
+        if (next)
+            *next = s;
+        return NULL;
+    }
+    s++;
+
+    skip_space_and_comments (&s);
+
+    /* 'end' stops on the closing '>' or on the terminator, so it is
+     * always one past the last character of the id. */
+    end = s;
+    while (*end && *end != '>')
+        end++;
+
+    if (next)
+        *next = *end ? end + 1 : end;
+
+    /* Only an empty id is an error. Measuring the length directly,
+     * (rather than stepping 'end' back onto the last character and
+     * then requiring end > s), keeps one-character ids such as "<a>"
+     * from being rejected. */
+    if (end == s)
+        return NULL;
+
+    return strndup (s, end - s);
+}
+
+/* Parse a References header value, putting a copy of each referenced
+ * message-id into 'array'. */
static void
parse_references (GPtrArray *array,
- const char *refs_str)
+ const char *refs)
{
- GMimeReferences *refs, *r;
- const char *message_id;
+ char *ref;
- if (refs_str == NULL)
+ if (refs == NULL)
return;
- refs = g_mime_references_decode (refs_str);
+ while (*refs) {
+ ref = parse_message_id (refs, &refs);
- for (r = refs; r; r = r->next) {
- message_id = g_mime_references_get_message_id (r);
- g_ptr_array_add (array, g_strdup (message_id));
+ if (ref)
+ g_ptr_array_add (array, ref);
}
-
- g_mime_references_free (refs);
}
notmuch_database_t *
@@ -344,8 +423,6 @@ notmuch_database_open (const char *path)
struct stat st;
int err;
- g_mime_init (0);
-
notmuch_path = g_strdup_printf ("%s/%s", path, ".notmuch");
err = stat (notmuch_path, &st);
@@ -397,31 +474,17 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
{
Xapian::WritableDatabase *db = notmuch->xapian_db;
Xapian::Document doc;
+ notmuch_message_t *message;
- GMimeStream *stream;
- GMimeParser *parser;
- GMimeMessage *message;
GPtrArray *parents, *thread_ids;
- FILE *file;
-
- const char *refs, *in_reply_to;
- const char *message_id;
+ const char *refs, *in_reply_to, *date, *header;
+ char *message_id;
- time_t time;
+ time_t time_value;
unsigned int i;
- file = fopen (filename, "r");
- if (! file) {
- fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
- exit (1);
- }
-
- stream = g_mime_stream_file_new (file);
-
- parser = g_mime_parser_new_with_stream (stream);
-
- message = g_mime_parser_construct_message (parser);
+ message = notmuch_message_open (filename);
try {
doc = Xapian::Document ();
@@ -430,16 +493,27 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
parents = g_ptr_array_new ();
- refs = g_mime_object_get_header (GMIME_OBJECT (message), "references");
+ refs = notmuch_message_get_header (message, "references");
parse_references (parents, refs);
- in_reply_to = g_mime_object_get_header (GMIME_OBJECT (message),
- "in-reply-to");
+ in_reply_to = notmuch_message_get_header (message, "in-reply-to");
parse_references (parents, in_reply_to);
+
for (i = 0; i < parents->len; i++)
add_term (doc, "ref", (char *) g_ptr_array_index (parents, i));
- message_id = g_mime_message_get_message_id (message);
+ header = notmuch_message_get_header (message, "message-id");
+ if (header) {
+ message_id = parse_message_id (header, NULL);
+ /* So the header value isn't RFC-compliant, but it's
+ * better than no message-id at all. */
+ if (message_id == NULL)
+ message_id = xstrdup (header);
+ } else {
+ /* XXX: Should generate a message_id here, (such as a SHA1
+ * sum of the message itself) */
+ message_id = NULL;
+ }
thread_ids = find_thread_ids (db, parents, message_id);
@@ -478,8 +552,15 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
doc.add_value (NOTMUCH_VALUE_THREAD, thread_id.str);
}
- g_mime_message_get_date (message, &time, NULL);
- doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time));
+ free (message_id);
+
+/*
+ date = notmuch_message_get_header (message, "date");
+ time_value = notmuch_parse_date (date, NULL);
+
+ doc.add_value (NOTMUCH_VALUE_DATE,
+ Xapian::sortable_serialise (time_value));
+*/
db->add_document (doc);
} catch (const Xapian::Error &error) {
@@ -488,9 +569,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
return NOTMUCH_STATUS_XAPIAN_EXCEPTION;
}
- g_object_unref (message);
- g_object_unref (parser);
- g_object_unref (stream);
+ notmuch_message_close (message);
return NOTMUCH_STATUS_SUCCESS;
}
diff --git a/message.c b/message.c
new file mode 100644
index 0000000..ea5d239
--- /dev/null
+++ b/message.c
@@ -0,0 +1,300 @@
+/* message.c - Utility functions for parsing an email message for notmuch.
+ *
+ * Copyright © 2009 Carl Worth
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/ .
+ *
+ * Author: Carl Worth <cworth@cworth.org>
+ */
+
+#include <stdarg.h>
+
+#include "notmuch-private.h"
+
+#include <glib.h>
+
+/* State for one message file opened by notmuch_message_open and
+ * released by notmuch_message_close. */
+struct _notmuch_message {
+ /* File objects: the descriptor returned by open() and the read-only,
+  * private mapping of the file returned by mmap(). */
+ int fd;
+ void *map;
+
+ /* Header storage: 'headers' is a hash table keyed by header name,
+  * (hashed and compared case-insensitively), holding the header
+  * values. 'restrict_headers' is set to 1 by
+  * notmuch_message_restrict_headersv, after which header lines whose
+  * names are not already keys in the table are skipped. */
+ int restrict_headers;
+ GHashTable *headers;
+
+ /* Parsing state: 'start' and 'size' describe the mapped buffer,
+  * 'next_line' is where notmuch_message_get_header resumes scanning,
+  * 'parsing_started' is set on the first call to that function and
+  * 'parsing_finished' once it has reached the end of the headers. */
+ char *start;
+ size_t size;
+ const char *next_line;
+ int parsing_started;
+ int parsing_finished;
+};
+
+/* Equality callback for the header hash table: returns 1 when the two
+ * strings are equal ignoring case, and 0 otherwise. */
+static int
+strcase_equal (const void *a, const void *b)
+{
+    const char *lhs = a;
+    const char *rhs = b;
+
+    if (strcasecmp (lhs, rhs) != 0)
+        return 0;
+
+    return 1;
+}
+
+/* Hash callback for the header hash table: the djb2 hash of the
+ * lower-cased string, so that names differing only in case hash to
+ * the same value, (as strcase_equal requires).
+ *
+ * A NULL pointer hashes like the empty string.
+ */
+static unsigned int
+strcase_hash (const void *ptr)
+{
+    /* Read the bytes as unsigned char: passing a negative char value
+     * to tolower is undefined behavior. */
+    const unsigned char *s = ptr;
+    unsigned int hash = 5381;
+
+    if (s == NULL)
+        return hash;
+
+    for (; *s; s++)
+        hash = ((hash << 5) + hash) + (unsigned int) tolower (*s);
+
+    return hash;
+}
+
+/* Open 'filename' read-only and map its contents into memory, ready
+ * for header parsing with notmuch_message_get_header.
+ *
+ * Returns a new message which the caller releases with
+ * notmuch_message_close. On any failure, (open, fstat or mmap), an
+ * error is printed to stderr, everything acquired so far is released
+ * and NULL is returned.
+ */
+notmuch_message_t *
+notmuch_message_open (const char *filename)
+{
+    notmuch_message_t *message;
+    struct stat st;
+
+    message = xcalloc (1, sizeof (notmuch_message_t));
+
+    /* On failure open() leaves -1 here, which is the "no descriptor"
+     * value that notmuch_message_close tests for. */
+    message->fd = open (filename, O_RDONLY);
+    if (message->fd < 0)
+        goto FAIL;
+
+    if (fstat (message->fd, &st) < 0)
+        goto FAIL;
+
+    message->map = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
+                         message->fd, 0);
+    if (message->map == MAP_FAILED) {
+        /* MAP_FAILED is not NULL, so clear it, otherwise the cleanup
+         * code would hand it to munmap. */
+        message->map = NULL;
+        goto FAIL;
+    }
+    message->size = st.st_size;
+
+    message->headers = g_hash_table_new_full (strcase_hash,
+                                              strcase_equal,
+                                              free,
+                                              free);
+
+    message->start = (char *) message->map;
+    message->next_line = message->start;
+    message->parsing_started = 0;
+    message->parsing_finished = 0;
+
+    return message;
+
+  FAIL:
+    /* Report before cleaning up, while errno still describes the
+     * call that failed. */
+    fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
+    notmuch_message_close (message);
+
+    return NULL;
+}
+
+/* Release everything held by 'message': the header table, the file
+ * mapping, the file descriptor and the message structure itself.
+ *
+ * Passing NULL is allowed and does nothing. The function also accepts
+ * the partially initialized message that notmuch_message_open hands
+ * it on a failure path.
+ */
+void
+notmuch_message_close (notmuch_message_t *message)
+{
+    if (message == NULL)
+        return;
+
+    if (message->headers)
+        g_hash_table_unref (message->headers);
+
+    if (message->map)
+        munmap (message->map, message->size);
+
+    /* Descriptor 0 is valid, so test against the -1 that a failed
+     * open() leaves behind rather than testing for non-zero. */
+    if (message->fd >= 0)
+        close (message->fd);
+
+    free (message);
+}
+
+/* va_list form of notmuch_message_restrict_headers.
+ *
+ * Each name read from 'va_headers', up to the terminating NULL, is
+ * copied into the header table with a NULL value, and the message is
+ * then flagged as restricted. Exits the program with an error if
+ * parsing of this message has already started.
+ */
+void
+notmuch_message_restrict_headersv (notmuch_message_t *message,
+                                   va_list va_headers)
+{
+    const char *name;
+
+    if (message->parsing_started) {
+        fprintf (stderr, "Error: notmuch_message_restrict_headers called after parsing has started\n");
+        exit (1);
+    }
+
+    for (name = va_arg (va_headers, char *);
+         name != NULL;
+         name = va_arg (va_headers, char *))
+    {
+        g_hash_table_insert (message->headers, xstrdup (name), NULL);
+    }
+
+    message->restrict_headers = 1;
+}
+
+/* Restrict 'message' to saving only the named headers.
+ *
+ * The variable arguments are header names, (const char *), ending
+ * with a NULL pointer. They are forwarded unchanged to
+ * notmuch_message_restrict_headersv.
+ */
+void
+notmuch_message_restrict_headers (notmuch_message_t *message, ...)
+{
+    va_list va_headers;
+
+    va_start (va_headers, message);
+
+    notmuch_message_restrict_headersv (message, va_headers);
+
+    /* Every va_start must be paired with a va_end in the same
+     * function before it returns. */
+    va_end (va_headers);
+}
+
+/* With our mmapped file, we don't get the benefit of terminated
+ * strings, so we can't use things like strchr(). We don't even know
+ * if there's a newline at the end of the file so we also have to be
+ * careful of that. Basically, every time we advance a pointer while
+ * parsing we must ensure we don't go beyond our buffer.
+ *
+ * WITHIN (s) is true while 's' points at a byte of the buffer that
+ * comes before the final byte. It expects a variable named 'message'
+ * with 'start' and 'size' members to be in scope, and a buffer of at
+ * least one byte.
+ */
+#define WITHIN(s) ((size_t) ((s) - message->start) < (message->size - 1))
+
+/* Each of the macros below dereferences 'ptr' only while WITHIN (ptr)
+ * holds, and stops advancing as soon as it does not. The resulting
+ * pointer is therefore never beyond the buffer, but it may be left on
+ * the final byte without that byte having been examined. Callers
+ * should test the result with WITHIN before relying on what it points
+ * to.
+ *
+ * Each macro expands to a single statement, (without a trailing
+ * semicolon), so it can be used safely as the body of an if/else. */
+
+/* Advance 'ptr' until pointing at a non-space character in the same
+ * line, (without overrunning the buffer). */
+#define SKIP_SPACE_IN_LINE(ptr)                                       \
+    do {                                                              \
+        while (WITHIN (ptr) && (*(ptr) == ' ' || *(ptr) == '\t'))     \
+            (ptr)++;                                                  \
+    } while (0)
+
+/* Advance 'ptr' until pointing at a non-space character, (without
+ * overrunning the buffer). The cast keeps bytes >= 0x80 from reaching
+ * isspace as negative values. */
+#define SKIP_SPACE(ptr)                                               \
+    do {                                                              \
+        while (WITHIN (ptr) && isspace ((unsigned char) *(ptr)))      \
+            (ptr)++;                                                  \
+    } while (0)
+
+/* Advance 'ptr' to the first occurrence of 'c' within the same
+ * line, (without overrunning the buffer). If 'c' does not occur,
+ * 'ptr' stops at the newline ending the line. */
+#define ADVANCE_TO(ptr, c)                                            \
+    do {                                                              \
+        while (WITHIN (ptr) && *(ptr) != '\n' && *(ptr) != (c))       \
+            (ptr)++;                                                  \
+    } while (0)
+
+/* Advance 'ptr' to the beginning of the next line not starting with
+ * an initial tab or space character, (without overrunning the
+ * buffer). */
+#define ADVANCE_TO_NEXT_HEADER_LINE(ptr)                              \
+    do {                                                              \
+        ADVANCE_TO ((ptr), '\n');                                     \
+        if (WITHIN (ptr))                                             \
+            (ptr)++;                                                  \
+    } while (WITHIN (ptr) &&                                          \
+             (*(ptr) == '\t' || *(ptr) == ' '))
+
+/* Return a newly allocated, NUL-terminated copy of the header text in
+ * the range [start, end).
+ *
+ * Newline characters are dropped, and a tab that immediately follows
+ * a newline, (the start of a continuation line), is copied as a
+ * single space. All other characters are copied unchanged.
+ */
+char *
+copy_header_value (const char *start, const char *end)
+{
+    char *copy = xmalloc (end - start + 1);
+    char *out = copy;
+    int after_newline = 0;
+    const char *in;
+
+    for (in = start; in < end; in++) {
+        if (*in == '\n') {
+            after_newline = 1;
+            continue;
+        }
+
+        *out++ = (after_newline && *in == '\t') ? ' ' : *in;
+        after_newline = 0;
+    }
+
+    *out = '\0';
+
+    return copy;
+}
+
+/* Return the value of the header named 'header_desired', (matched
+ * case-insensitively), or NULL if the message has no such header.
+ *
+ * Headers are parsed lazily: each call first consults the table of
+ * headers already seen and otherwise resumes scanning the mapped file
+ * where the previous call stopped, saving every header it passes on
+ * the way. Scanning ends at the first empty line or at the end of the
+ * buffer, whichever comes first.
+ *
+ * When a header occurs more than once, the first occurrence is the
+ * one saved and returned. Later occurrences are skipped so that a
+ * value already handed to the caller is never freed before the
+ * message is closed.
+ *
+ * The returned string is owned by 'message'.
+ */
+const char *
+notmuch_message_get_header (notmuch_message_t *message,
+                            const char *header_desired)
+{
+    int contains;
+    const char *s, *colon;
+    char *header, *value;
+    gpointer saved;
+    int match;
+
+    message->parsing_started = 1;
+
+    contains = g_hash_table_lookup_extended (message->headers,
+                                             header_desired, NULL,
+                                             (gpointer *) &value);
+    /* A key whose value is NULL is only the placeholder inserted by
+     * notmuch_message_restrict_headersv. The header itself has not
+     * been parsed yet, so keep going rather than returning NULL. */
+    if (contains && value)
+        return value;
+
+    if (message->parsing_finished)
+        return NULL;
+
+    while (1) {
+        s = message->next_line;
+
+        /* Stop at the empty line that ends the headers, and also when
+         * the buffer runs out first. Without the WITHIN test a file
+         * with no empty line would make this loop spin forever, since
+         * none of the macros below advance past the end. */
+        if (! WITHIN (s) || *s == '\n') {
+            message->parsing_finished = 1;
+            return NULL;
+        }
+
+        if (*s == '\t') {
+            fprintf (stderr, "Warning: Unexpected continued value\n");
+            ADVANCE_TO_NEXT_HEADER_LINE (message->next_line);
+            continue;
+        }
+
+        colon = s;
+        ADVANCE_TO (colon, ':');
+
+        if (! WITHIN (colon) || *colon == '\n') {
+            /* The mapped buffer is not NUL-terminated, so bound the
+             * output to the part of the line that was just scanned. */
+            fprintf (stderr, "Warning: Unexpected non-header line: %.*s\n",
+                     (int) (colon - s), s);
+            ADVANCE_TO_NEXT_HEADER_LINE (message->next_line);
+            continue;
+        }
+
+        header = xstrndup (s, colon - s);
+
+        saved = NULL;
+        contains = g_hash_table_lookup_extended (message->headers,
+                                                 header, NULL, &saved);
+
+        /* Skip this line if the caller restricted parsing to a list
+         * that does not name it, or if a value for this header has
+         * already been saved. */
+        if ((message->restrict_headers && ! contains) || saved != NULL) {
+            free (header);
+            message->next_line = colon;
+            ADVANCE_TO_NEXT_HEADER_LINE (message->next_line);
+            continue;
+        }
+
+        s = colon + 1;
+        SKIP_SPACE_IN_LINE (s);
+
+        message->next_line = s;
+        ADVANCE_TO_NEXT_HEADER_LINE (message->next_line);
+
+        value = copy_header_value (s, message->next_line);
+
+        /* Compare before inserting: if the name is already present as
+         * a placeholder key, the insert below frees 'header'. */
+        match = (strcasecmp (header, header_desired) == 0);
+
+        g_hash_table_insert (message->headers, header, value);
+
+        if (match)
+            return value;
+    }
+}
diff --git a/message.h b/message.h
new file mode 100644
index 0000000..d0a34a1
--- /dev/null
+++ b/message.h
@@ -0,0 +1,19 @@
+/* message.h - Utility functions for parsing an email message for notmuch.
+ *
+ * Copyright © 2009 Carl Worth
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/ .
+ *
+ * Author: Carl Worth <cworth@cworth.org>
+ */
diff --git a/notmuch-private.h b/notmuch-private.h
index 15d6db4..b7d27e9 100644
--- a/notmuch-private.h
+++ b/notmuch-private.h
@@ -23,8 +23,17 @@
#include "notmuch.h"
+#include <stdio.h>
#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <ctype.h>
NOTMUCH_BEGIN_DECLS
@@ -44,6 +53,78 @@ xstrdup (const char *s);
char *
xstrndup (const char *s, size_t n);
+/* message.c */
+
+/* XXX: I haven't decided yet whether these will actually get exported
+ * into the public interface in notmuch.h
+ */
+
+typedef struct _notmuch_message notmuch_message_t;
+
+/* Open a file containing a single email message.
+ *
+ * The caller should call notmuch_message_close when done with this.
+ *
+ * Returns NULL if any error occurs.
+ */
+notmuch_message_t *
+notmuch_message_open (const char *filename);
+
+/* Close a notmuch message previously opened with notmuch_message_open. */
+void
+notmuch_message_close (notmuch_message_t *message);
+
+/* Restrict 'message' to only save the named headers.
+ *
+ * When the caller is only interested in a short list of headers,
+ * known in advance, calling this function can avoid wasted time and
+ * memory parsing/saving header values that will never be needed.
+ *
+ * The variable arguments should be a list of const char * with a
+ * final '(const char *) NULL' to terminate the list.
+ *
+ * If this function is called, it must be called before any calls to
+ * notmuch_message_get_header for this message.
+ *
+ * After calling this function, if notmuch_message_get_header is
+ * called with a header name not in this list, then NULL will be
+ * returned even if that header exists in the actual message.
+ */
+void
+notmuch_message_restrict_headers (notmuch_message_t *message, ...);
+
+/* Identical to notmuch_message_restrict_headers but accepting a va_list. */
+void
+notmuch_message_restrict_headersv (notmuch_message_t *message,
+ va_list va_headers);
+
+/* Get the value of the specified header from the message.
+ *
+ * The header name is case insensitive.
+ *
+ * The returned value is owned by the notmuch message and is valid
+ * only until the message is closed. The caller should copy it if
+ * needing to modify the value or to hold onto it for longer.
+ *
+ * Returns NULL if the message does not contain a header line matching
+ * 'header'.
+ */
+const char *
+notmuch_message_get_header (notmuch_message_t *message,
+ const char *header);
+
+/* date.c */
+
+/* Parse an RFC 822 date string to a time_t value.
+ *
+ * The tz_offset argument can be used to also obtain the time-zone
+ * offset, (but can be NULL if the call is not interested in that).
+ *
+ * Returns 0 on error.
+ */
+time_t
+notmuch_parse_date (const char *str, int *tz_offset);
+
NOTMUCH_END_DECLS
#endif