diff options
author | Carl Worth <cworth@cworth.org> | 2009-10-28 15:41:42 -0700 |
---|---|---|
committer | Carl Worth <cworth@cworth.org> | 2009-10-28 15:41:42 -0700 |
commit | 56218ddbb4a72fdec534773f2bd4e85aec914ae9 (patch) | |
tree | 2942e5d8707cb626cb1fef385f231f297279a6c7 /index.cc | |
parent | cfa228a3d4b300df3551e811028508d3de5cd81c (diff) |
index: Don't bother indexing quoted portions of messages (and signatures).
Our old notmuch-index-message.cc code had this, but I originally
left it out when adding indexing back in. I was concerned primarily
with mistakenly detecting signature markers and omitting important
text, (for example, I often do long lines of "----" as section
separators).
But now I see that there's a performance benefit to skipping the
quotations, (about 120 files/sec. instead of 95 files/sec.). I mitigated
the bogus signature checking by recognizing nothing other than the
all-time classic "-- ".
Diffstat (limited to 'index.cc')
-rw-r--r-- | index.cc | 56 |
1 files changed, 55 insertions, 1 deletions
@@ -135,6 +135,60 @@ skip_re_in_subject (const char *subject) return s; } +/* Given a string representing the body of a message, generate terms + * for it, (skipping quoted portions and signatures). + * + * This function is evil in that it modifies the string passed to it, + * (changing some newlines into '\0'). + */ +static void +_index_body_text (notmuch_message_t *message, char *body) +{ + char *line, *line_end, *next_line; + + if (body == NULL) + return; + + next_line = body; + + while (1) { + line = next_line; + if (*line == '\0') + break; + + next_line = strchr (line, '\n'); + if (next_line == NULL) { + next_line = line + strlen (line); + } + line_end = next_line - 1; + + /* Get to the next non-blank line. */ + while (*next_line == '\n') + next_line++; + + /* Skip blank lines. */ + if (line_end < line) + continue; + + /* Skip lines that are quotes. */ + if (*line == '>') + continue; + + /* Also skip lines introducing a quote on the next line. */ + if (*line_end == ':' && *next_line == '>') + continue; + + /* Finally, bail as soon as we see a signature. */ + /* XXX: Should only do this if "near" the end of the message. */ + if (strncmp (line, "-- ", 3) == 0) + break; + + *(line_end + 1) = '\0'; + + _notmuch_message_gen_terms (message, NULL, line); + } +} + /* Callback to generate terms for each mime part of a message. */ static void _index_mime_part (notmuch_message_t *message, @@ -207,7 +261,7 @@ _index_mime_part (notmuch_message_t *message, g_byte_array_append (byte_array, (guint8 *) "\0", 1); body = (char *) g_byte_array_free (byte_array, FALSE); - _notmuch_message_gen_terms (message, NULL, body); + _index_body_text (message, body); free (body); } |